From 35476da35777360cf988b707bf9f9005e7b037d5 Mon Sep 17 00:00:00 2001
From: Matt
Date: Sun, 29 Dec 2024 09:24:01 -0800
Subject: [PATCH 1/4] Add extra eigen headers for gtsam

---
 upstream_utils/eigen.py | 3 +-
 .../thirdparty/eigen/include/Eigen/Dense | 7 +
 .../include/unsupported/Eigen/CXX11/Tensor | 142 ++
 .../unsupported/Eigen/CXX11/TensorSymmetry | 40 +
 .../CXX11/src/Tensor/InternalHeaderCheck.h | 3 +
 .../Eigen/CXX11/src/Tensor/Tensor.h | 382 ++++
 .../Eigen/CXX11/src/Tensor/TensorArgMax.h | 284 +++
 .../Eigen/CXX11/src/Tensor/TensorAssign.h | 213 ++
 .../Eigen/CXX11/src/Tensor/TensorBase.h | 1244 ++++++++++
 .../Eigen/CXX11/src/Tensor/TensorBlock.h | 1474 ++++++++++++
 .../CXX11/src/Tensor/TensorBroadcasting.h | 1001 ++++++++
 .../Eigen/CXX11/src/Tensor/TensorChipping.h | 474 ++++
 .../CXX11/src/Tensor/TensorConcatenation.h | 354 +++
 .../CXX11/src/Tensor/TensorContraction.h | 962 ++++++++
 .../src/Tensor/TensorContractionBlocking.h | 69 +
 .../CXX11/src/Tensor/TensorContractionCuda.h | 7 +
 .../CXX11/src/Tensor/TensorContractionGpu.h | 1387 +++++++++++
 .../src/Tensor/TensorContractionMapper.h | 533 +++++
 .../CXX11/src/Tensor/TensorContractionSycl.h | 1654 ++++++++++++++
 .../src/Tensor/TensorContractionThreadPool.h | 1552 +++++++++++++
 .../Eigen/CXX11/src/Tensor/TensorConversion.h | 416 ++++
 .../CXX11/src/Tensor/TensorConvolution.h | 1123 +++++++++
 .../CXX11/src/Tensor/TensorConvolutionSycl.h | 546 +++++
 .../Eigen/CXX11/src/Tensor/TensorCostModel.h | 190 ++
 .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 309 +++
 .../Eigen/CXX11/src/Tensor/TensorDevice.h | 139 ++
 .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 7 +
 .../CXX11/src/Tensor/TensorDeviceDefault.h | 113 +
 .../Eigen/CXX11/src/Tensor/TensorDeviceGpu.h | 395 ++++
 .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 567 +++++
 .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 376 +++
 .../CXX11/src/Tensor/TensorDimensionList.h | 119 +
 .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 329 +++
 .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 196 ++
 .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 859 +++++++
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 672 ++++++
 .../Eigen/CXX11/src/Tensor/TensorExpr.h | 329 +++
 .../Eigen/CXX11/src/Tensor/TensorFFT.h | 674 ++++++
 .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 226 ++
 .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 233 ++
 .../src/Tensor/TensorForwardDeclarations.h | 215 ++
 .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 422 ++++
 .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 271 +++
 .../CXX11/src/Tensor/TensorGlobalFunctions.h | 36 +
 .../src/Tensor/TensorGpuHipCudaDefines.h | 101 +
 .../src/Tensor/TensorGpuHipCudaUndefines.h | 45 +
 .../Eigen/CXX11/src/Tensor/TensorIO.h | 413 ++++
 .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 590 +++++
 .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 620 +++++
 .../Eigen/CXX11/src/Tensor/TensorInflation.h | 226 ++
 .../CXX11/src/Tensor/TensorInitializer.h | 78 +
 .../Eigen/CXX11/src/Tensor/TensorIntDiv.h | 261 +++
 .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 185 ++
 .../Eigen/CXX11/src/Tensor/TensorMacros.h | 85 +
 .../Eigen/CXX11/src/Tensor/TensorMap.h | 191 ++
 .../Eigen/CXX11/src/Tensor/TensorMeta.h | 292 +++
 .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 984 ++++++++
 .../Eigen/CXX11/src/Tensor/TensorPadding.h | 620 +++++
 .../Eigen/CXX11/src/Tensor/TensorPatch.h | 260 +++
 .../Eigen/CXX11/src/Tensor/TensorRandom.h | 309 +++
 .../Eigen/CXX11/src/Tensor/TensorReduction.h | 1038 +++++++++
 .../CXX11/src/Tensor/TensorReductionGpu.h | 958 ++++++++
 .../CXX11/src/Tensor/TensorReductionSycl.h | 588 +++++
 .../Eigen/CXX11/src/Tensor/TensorRef.h | 317 +++
 .../Eigen/CXX11/src/Tensor/TensorReverse.h | 410 ++++
 .../Eigen/CXX11/src/Tensor/TensorRoll.h | 361 +++
 .../Eigen/CXX11/src/Tensor/TensorScan.h | 474 ++++
 .../Eigen/CXX11/src/Tensor/TensorScanSycl.h | 506 ++++
 .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 415 ++++
 .../Eigen/CXX11/src/Tensor/TensorStorage.h | 144 ++
 .../Eigen/CXX11/src/Tensor/TensorStriding.h | 316 +++
 .../Eigen/CXX11/src/Tensor/TensorTrace.h | 278 +++
 .../Eigen/CXX11/src/Tensor/TensorTraits.h | 231 ++
 .../Eigen/CXX11/src/Tensor/TensorUInt128.h | 229 ++
 .../CXX11/src/Tensor/TensorVolumePatch.h | 619 +++++
 .../src/TensorSymmetry/DynamicSymmetry.h | 296 +++
 .../src/TensorSymmetry/InternalHeaderCheck.h | 4 +
 .../CXX11/src/TensorSymmetry/StaticSymmetry.h | 223 ++
 .../Eigen/CXX11/src/TensorSymmetry/Symmetry.h | 335 +++
 .../TensorSymmetry/util/TemplateGroupTheory.h | 492 ++++
 .../unsupported/Eigen/SpecialFunctions | 104 +
 .../KroneckerProduct/KroneckerTensorProduct.h | 284 +++
 .../BesselFunctionsArrayAPI.h | 276 +++
 .../BesselFunctionsBFloat16.h | 71 +
 .../BesselFunctionsFunctors.h | 323 +++
 .../SpecialFunctions/BesselFunctionsHalf.h | 69 +
 .../SpecialFunctions/BesselFunctionsImpl.h | 1638 +++++++++++++
 .../BesselFunctionsPacketMath.h | 108 +
 .../SpecialFunctions/HipVectorCompatibility.h | 71 +
 .../SpecialFunctions/InternalHeaderCheck.h | 4 +
 .../SpecialFunctionsArrayAPI.h | 159 ++
 .../SpecialFunctionsBFloat16.h | 73 +
 .../SpecialFunctionsFunctors.h | 326 +++
 .../SpecialFunctions/SpecialFunctionsHalf.h | 73 +
 .../SpecialFunctions/SpecialFunctionsImpl.h | 2030 +++++++++++++++++
 .../SpecialFunctionsPacketMath.h | 112 +
 .../arch/AVX/BesselFunctions.h | 46 +
 .../arch/AVX/SpecialFunctions.h | 16 +
 .../arch/AVX512/BesselFunctions.h | 46 +
 .../arch/AVX512/SpecialFunctions.h | 16 +
 .../arch/GPU/SpecialFunctions.h | 317 +++
 .../arch/NEON/BesselFunctions.h | 54 +
 .../arch/NEON/SpecialFunctions.h | 34 +
 103 files changed, 40290 insertions(+), 1 deletion(-)
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/Eigen/Dense
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/Tensor
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/TensorSymmetry
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/InternalHeaderCheck.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRoll.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/InternalHeaderCheck.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/SpecialFunctions
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/InternalHeaderCheck.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h
diff --git a/upstream_utils/eigen.py b/upstream_utils/eigen.py
index 875c6dfbbae..0974b1b6b56 100755
--- a/upstream_utils/eigen.py
+++ b/upstream_utils/eigen.py
@@ -49,6 +49,7 @@ def eigen_inclusions(dp, f):
     modules = [
         "Cholesky",
         "Core",
+        "Dense",
         "Eigenvalues",
         "Geometry",
         "Householder",
@@ -86,7 +87,7 @@ def unsupported_inclusions(dp, f):
         return False
 
     # Include the MatrixFunctions module
-    return "MatrixFunctions" in abspath
+    return "MatrixFunctions" in abspath or "SpecialFunctions" in abspath or "Tensor" in abspath or abspath.endswith("SpecialFunctionsFunctors.h")
 
 
 def copy_upstream_src(wpilib_root):
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/Eigen/Dense b/wpimath/src/main/native/thirdparty/eigen/include/Eigen/Dense
new file mode 100644
index 00000000000..5768910bd88
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/Eigen/Dense
@@ -0,0 +1,7 @@
+#include "Core"
+#include "LU"
+#include "Cholesky"
+#include "QR"
+#include "SVD"
+#include "Geometry"
+#include "Eigenvalues"
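The new Eigen/Dense file above is the standard Eigen umbrella header, so downstream code (e.g. gtsam) can pull in all dense modules with one include. A minimal sketch of what it enables, not part of the patch:

#include <Eigen/Dense>

int main() {
  Eigen::Matrix3d A = Eigen::Matrix3d::Random();
  Eigen::Vector3d b(1.0, 2.0, 3.0);
  // LU, QR, SVD, etc. all come in through the Dense umbrella header.
  Eigen::Vector3d x = A.fullPivLu().solve(b);
  return x.allFinite() ? 0 : 1;
}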
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/Tensor b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/Tensor
new file mode 100644
index 00000000000..6ac89f1b67e
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/Tensor
@@ -0,0 +1,142 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+// Copyright (C) 2013 Christian Seiler
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// #ifndef EIGEN_CXX11_TENSOR_MODULE_H
+#define EIGEN_CXX11_TENSOR_MODULE_H
+
+#include "../../../Eigen/Core"
+
+#include "../SpecialFunctions"
+
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+// IWYU pragma: begin_exports
+#include "../../../Eigen/src/Core/util/Meta.h"
+#include "../../../Eigen/src/Core/util/MaxSizeVector.h"
+// IWYU pragma: end_exports
+
+/** \defgroup CXX11_Tensor_Module Tensor Module
+ *
+ * This module provides a Tensor class for storing arbitrarily indexed
+ * objects.
+ *
+ * \code
+ * #include <unsupported/Eigen/CXX11/Tensor>
+ * \endcode
+ *
+ * Much of the documentation can be found \ref eigen_tensors "here".
+ */
+
+#include <atomic>
+#include <chrono>
+#include <cmath>
+#include <cstddef>
+#include <cstring>
+#include <iterator>
+#include <numeric>
+#include <random>
+#include <thread>
+
+#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
+// #include "ThreadPool"
+#endif
+
+#ifdef EIGEN_USE_GPU
+#include <iostream>
+#if defined(EIGEN_USE_HIP)
+#include <hip/hip_runtime.h>
+#else
+#include <cuda_runtime.h>
+#endif
+#endif
+
+// IWYU pragma: begin_exports
+#include "src/Tensor/TensorMacros.h"
+#include "src/Tensor/TensorForwardDeclarations.h"
+#include "src/Tensor/TensorMeta.h"
+#include "src/Tensor/TensorFunctors.h"
+#include "src/Tensor/TensorCostModel.h"
+#include "src/Tensor/TensorDeviceDefault.h"
+#include "src/Tensor/TensorDeviceThreadPool.h"
+#include "src/Tensor/TensorDeviceGpu.h"
+#ifndef gpu_assert
+#define gpu_assert(x)
+#endif
+#include "src/Tensor/TensorDeviceSycl.h"
+#include "src/Tensor/TensorIndexList.h"
+#include "src/Tensor/TensorDimensionList.h"
+#include "src/Tensor/TensorDimensions.h"
+#include "src/Tensor/TensorInitializer.h"
+#include "src/Tensor/TensorTraits.h"
+#include "src/Tensor/TensorRandom.h"
+#include "src/Tensor/TensorUInt128.h"
+#include "src/Tensor/TensorIntDiv.h"
+#include "src/Tensor/TensorGlobalFunctions.h"
+
+#include "src/Tensor/TensorIO.h"
+
+#include "src/Tensor/TensorBase.h"
+#include "src/Tensor/TensorBlock.h"
+
+#include "src/Tensor/TensorEvaluator.h"
+#include "src/Tensor/TensorExpr.h"
+#include "src/Tensor/TensorReduction.h"
+#include "src/Tensor/TensorReductionGpu.h"
+#include "src/Tensor/TensorArgMax.h"
+#include "src/Tensor/TensorConcatenation.h"
+#include "src/Tensor/TensorContractionMapper.h"
+#include "src/Tensor/TensorContractionBlocking.h"
+#include "src/Tensor/TensorContraction.h"
+#include "src/Tensor/TensorContractionThreadPool.h"
+#include "src/Tensor/TensorContractionGpu.h"
+#include "src/Tensor/TensorConversion.h"
+#include "src/Tensor/TensorConvolution.h"
+#include "src/Tensor/TensorFFT.h"
+#include "src/Tensor/TensorPatch.h"
+#include "src/Tensor/TensorImagePatch.h"
+#include "src/Tensor/TensorVolumePatch.h"
+#include "src/Tensor/TensorBroadcasting.h"
+#include "src/Tensor/TensorChipping.h"
+#include "src/Tensor/TensorInflation.h"
+#include "src/Tensor/TensorLayoutSwap.h"
+#include "src/Tensor/TensorMorphing.h"
+#include "src/Tensor/TensorPadding.h"
+#include "src/Tensor/TensorReverse.h"
+#include "src/Tensor/TensorRoll.h"
+#include "src/Tensor/TensorShuffling.h"
+#include "src/Tensor/TensorStriding.h"
+#include "src/Tensor/TensorCustomOp.h"
+#include "src/Tensor/TensorEvalTo.h"
+#include "src/Tensor/TensorForcedEval.h"
+#include "src/Tensor/TensorGenerator.h"
+#include "src/Tensor/TensorAssign.h"
+#include "src/Tensor/TensorScan.h"
+#include "src/Tensor/TensorTrace.h"
+
+#ifdef EIGEN_USE_SYCL
+#include "src/Tensor/TensorReductionSycl.h"
+#include "src/Tensor/TensorConvolutionSycl.h"
+#include "src/Tensor/TensorContractionSycl.h"
+#include "src/Tensor/TensorScanSycl.h"
+#endif
+
+#include "src/Tensor/TensorExecutor.h"
+#include "src/Tensor/TensorDevice.h"
+
+#include "src/Tensor/TensorStorage.h"
+#include "src/Tensor/Tensor.h"
+#include "src/Tensor/TensorFixedSize.h"
+#include "src/Tensor/TensorMap.h"
+#include "src/Tensor/TensorRef.h"
+// IWYU pragma: end_exports
+
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+// #endif // EIGEN_CXX11_TENSOR_MODULE_H
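As a quick smoke test of the module header above, a hypothetical snippet (not part of the patch, assuming the vendored include path is on the compiler's search path):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<double, 3> t(2, 3, 4);   // rank-3, dynamically sized
  t.setConstant(1.0);
  t(0, 1, 2) = 5.0;
  Eigen::Tensor<double, 0> s = t.sum();  // reductions yield rank-0 tensors
  return s() == 28.0 ? 0 : 1;            // 23 ones plus the 5.0 overwrite
}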
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/TensorSymmetry b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/TensorSymmetry
new file mode 100644
index 00000000000..d4a44d0a704
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/TensorSymmetry
@@ -0,0 +1,40 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE_H
+#define EIGEN_CXX11_TENSORSYMMETRY_MODULE_H
+
+#include "Tensor"
+
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+// #include "src/util/CXX11Meta.h"
+
+/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module
+ *
+ * This module provides classes that allow for the definition of
+ * symmetries w.r.t. tensor indices.
+ *
+ * Including this module will implicitly include the Tensor module.
+ *
+ * \code
+ * #include <unsupported/Eigen/CXX11/TensorSymmetry>
+ * \endcode
+ */
+
+// IWYU pragma: begin_exports
+#include "src/TensorSymmetry/util/TemplateGroupTheory.h"
+#include "src/TensorSymmetry/Symmetry.h"
+#include "src/TensorSymmetry/StaticSymmetry.h"
+#include "src/TensorSymmetry/DynamicSymmetry.h"
+// IWYU pragma: end_exports
+
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif  // EIGEN_CXX11_TENSORSYMMETRY_MODULE_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/InternalHeaderCheck.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/InternalHeaderCheck.h
new file mode 100644
index 00000000000..9e4c1ed9c37
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_CXX11_TENSOR_MODULE_H
+#error "Please include unsupported/Eigen/CXX11/Tensor instead of including headers inside the src directory directly."
+#endif
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
new file mode 100644
index 00000000000..9dc95916c0c
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -0,0 +1,382 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+// Copyright (C) 2013 Christian Seiler
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class Tensor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor class.
+ *
+ * The %Tensor class is the work-horse for all \em dense tensors within Eigen.
+ *
+ * The %Tensor class encompasses only dynamic-size objects so far.
+ *
+ * The first two template parameters are required:
+ * \tparam Scalar_ Numeric type, e.g. float, double, int or `std::complex<float>`.
+ * User defined scalar types are supported as well (see \ref user_defined_scalars "here").
+ * \tparam NumIndices_ Number of indices (i.e. rank of the tensor)
+ *
+ * The remaining template parameters are optional -- in most cases you don't have to worry about them.
+ * \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of either
+ * \b #AutoAlign or \b #DontAlign.
+ * The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter
+ * controls alignment, which is required for vectorization. It defaults to aligning tensors. Note that tensors currently
+ * do not support any operations that profit from vectorization. Support for such operations (i.e. adding two tensors
+ * etc.) is planned.
+ *
+ * You can access elements of tensors using normal subscripting:
+ *
+ * \code
+ * Eigen::Tensor<double, 4> t(10, 10, 10, 10);
+ * t(0, 1, 2, 3) = 42.0;
+ * \endcode
+ *
+ * This class can be extended with the help of the plugin mechanism described on the page
+ * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN,
+ * \c EIGEN_TENSORBASE_PLUGIN, and \c EIGEN_READONLY_TENSORBASE_PLUGIN.
+ *
+ * Some notes:
+ *
+ * <dl>
+ * <dt><b>Relation to other parts of Eigen:</b></dt>
+ * <dd>The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that
+ * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code
+ * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the
+ * %Tensor class does not provide any of these features and is only available as a stand-alone class that just allows
+ * for coefficient access. Also, when fixed-size tensors are implemented, the number of template arguments is likely to
+ * change dramatically.</dd>
+ * </dl>
+ *
+ * \ref TopicStorageOrders
+ */
+
+template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
+class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > {
+ public:
+  typedef Tensor<Scalar_, NumIndices_, Options_, IndexType_> Self;
+  typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > Base;
+  typedef typename Eigen::internal::nested<Self>::type Nested;
+  typedef typename internal::traits<Self>::StorageKind StorageKind;
+  typedef typename internal::traits<Self>::Index Index;
+  typedef Scalar_ Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef typename Base::CoeffReturnType CoeffReturnType;
+
+  enum { IsAligned = (EIGEN_MAX_ALIGN_BYTES > 0) && !(Options_ & DontAlign), CoordAccess = true, RawAccess = true };
+
+  static constexpr int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
+  static constexpr int Options = Options_;
+  static constexpr int NumIndices = NumIndices_;
+  typedef DSizes<Index, NumIndices_> Dimensions;
+
+ protected:
+  TensorStorage<Scalar, Dimensions, Options> m_storage;
+
+  template <typename CustomIndices>
+  struct isOfNormalIndex {
+    static const bool is_array = internal::is_base_of<array<Index, NumIndices>, CustomIndices>::value;
+    static const bool is_int = NumTraits<CustomIndices>::IsInteger;
+    static const bool value = is_array | is_int;
+  };
+
+ public:
+  // Metadata
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() { return m_storage.data(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar* data() const { return m_storage.data(); }
+
+  // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+  // work, because that uses base().coeffRef() - and we don't yet
+  // implement a similar class hierarchy
+  inline Self& base() { return *this; }
+  inline const Self& base() const { return *this; }
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex,
+                                                            IndexTypes... otherIndices) const {
+    // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+  }
+
+  // normal indices
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const {
+    eigen_internal_assert(checkIndexRange(indices));
+    return m_storage.data()[linearizedIndex(indices)];
+  }
+
+  // custom indices
+  template <typename CustomIndices, EIGEN_SFINAE_ENABLE_IF(!(isOfNormalIndex<CustomIndices>::value))>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const {
+    return coeff(internal::customIndices2Array<Index, NumIndices>(indices));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return m_storage.data()[0];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const {
+    eigen_internal_assert(index >= 0 && index < size());
+    return m_storage.data()[index];
+  }
+
+  template <typename... IndexTypes>
+  inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) {
+    // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+  }
+
+  // normal indices
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) {
+    eigen_internal_assert(checkIndexRange(indices));
+    return m_storage.data()[linearizedIndex(indices)];
+  }
+
+  // custom indices
+  template <typename CustomIndices, EIGEN_SFINAE_ENABLE_IF(!(isOfNormalIndex<CustomIndices>::value))>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices) {
+    return coeffRef(internal::customIndices2Array<Index, NumIndices>(indices));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return m_storage.data()[0];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    eigen_internal_assert(index >= 0 && index < size());
+    return m_storage.data()[index];
+  }
+
+  template <typename... IndexTypes>
+  inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const {
+    // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+  }
+
+  // custom indices
+  template <typename CustomIndices, EIGEN_SFINAE_ENABLE_IF(!(isOfNormalIndex<CustomIndices>::value))>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const {
+    return coeff(internal::customIndices2Array<Index, NumIndices>(indices));
+  }
+
+  // normal indices
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const {
+    return coeff(indices);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const {
+    eigen_internal_assert(index >= 0 && index < size());
+    return coeff(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return coeff();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const {
+    // The bracket operator is only for vectors, use the parenthesis operator instead.
+    EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return coeff(index);
+  }
+
+  template <typename... IndexTypes>
+  inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) {
+    // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+  }
+
+  // normal indices
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) {
+    return coeffRef(indices);
+  }
+
+  // custom indices
+  template <typename CustomIndices, EIGEN_SFINAE_ENABLE_IF(!(isOfNormalIndex<CustomIndices>::value))>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices) {
+    return coeffRef(internal::customIndices2Array<Index, NumIndices>(indices));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) {
+    eigen_assert(index >= 0 && index < size());
+    return coeffRef(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return coeffRef();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) {
+    // The bracket operator is only for vectors, use the parenthesis operator instead
+    EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return coeffRef(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor() : m_storage() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(const Self& other) : Base(other), m_storage(other.m_storage) {}
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions)
+      : m_storage(firstDimension, otherDimensions...) {
+    // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  }
+
+  /** Normal Dimension */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array<Index, NumIndices>& dimensions)
+      : m_storage(internal::array_prod(dimensions), dimensions) {
+    EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+  }
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) {
+    EIGEN_STATIC_ASSERT(OtherDerived::NumDimensions == Base::NumDimensions, Number_of_dimensions_must_match)
+    typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+    Assign assign(*this, other.derived());
+    resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+  }
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other) {
+    EIGEN_STATIC_ASSERT(OtherDerived::NumDimensions == Base::NumDimensions, Number_of_dimensions_must_match)
+    typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+    Assign assign(*this, other.derived());
+    resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Self&& other) : m_storage(std::move(other.m_storage)) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(Self&& other) {
+    m_storage = std::move(other.m_storage);
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) {
+    typedef TensorAssignOp<Tensor, const Tensor> Assign;
+    Assign assign(*this, other);
+    resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+    return *this;
+  }
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) {
+    typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+    Assign assign(*this, other);
+    resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+    return *this;
+  }
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC void resize(Index firstDimension, IndexTypes... otherDimensions) {
+    // The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}});
+  }
+
+  /** Normal Dimension */
+  EIGEN_DEVICE_FUNC void resize(const array<Index, NumIndices>& dimensions) {
+#ifndef EIGEN_NO_DEBUG
+    Index size = Index(1);
+    for (int i = 0; i < NumIndices; i++) {
+      internal::check_rows_cols_for_overflow<Dynamic>::run(size, dimensions[i]);
+      size *= dimensions[i];
+    }
+#else
+    Index size = internal::array_prod(dimensions);
+#endif
+
+#ifdef EIGEN_INITIALIZE_COEFFS
+    bool size_changed = size != this->size();
+    m_storage.resize(size, dimensions);
+    if (size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+#else
+    m_storage.resize(size, dimensions);
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC void resize() {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    // Nothing to do: rank 0 tensors have fixed size
+  }
+
+  template <typename FirstType, typename... OtherTypes>
+  EIGEN_DEVICE_FUNC void resize(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
+    array<Index, NumIndices> dims;
+    for (int i = 0; i < NumIndices; ++i) {
+      dims[i] = static_cast<Index>(dimensions[i]);
+    }
+    resize(dims);
+  }
+
+  /** Custom Dimension */
+  template <typename CustomDimension, EIGEN_SFINAE_ENABLE_IF(!(isOfNormalIndex<CustomDimension>::value))>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions) {
+    resize(internal::customIndices2Array<Index, NumIndices>(dimensions));
+  }
+
+  template <std::ptrdiff_t... Indices>
+  EIGEN_DEVICE_FUNC void resize(const Sizes<Indices...>& dimensions) {
+    array<Index, NumIndices> dims;
+    for (int i = 0; i < NumIndices; ++i) {
+      dims[i] = static_cast<Index>(dimensions[i]);
+    }
+    resize(dims);
+  }
+
+#ifdef EIGEN_TENSOR_PLUGIN
+#include EIGEN_TENSOR_PLUGIN
+#endif
+
+ protected:
+  bool checkIndexRange(const array<Index, NumIndices>& indices) const {
+    using internal::array_apply_and_reduce;
+    using internal::array_zip_and_reduce;
+    using internal::greater_equal_zero_op;
+    using internal::lesser_op;
+    using internal::logical_and_op;
+
+    return
+        // check whether the indices are all >= 0
+        array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) &&
+        // check whether the indices fit in the dimensions
+        array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const {
+    if (Options & RowMajor) {
+      return m_storage.dimensions().IndexOfRowMajor(indices);
+    } else {
+      return m_storage.dimensions().IndexOfColMajor(indices);
+    }
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_H
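An illustration of the resize/coefficient-access API defined in Tensor.h above; a hypothetical usage sketch, not part of the patch:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> m;  // rank fixed at compile time, sizes dynamic
  m.resize(3, 4);             // variadic resize must pass exactly rank arguments
  m.setZero();
  m(2, 3) = 1.5f;             // operator() statically checks the index count
  return m.size() == 12 ? 0 : 1;
}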
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
new file mode 100644
index 00000000000..f272354c4c5
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -0,0 +1,284 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo
+//                    Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
+#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+/** \class TensorIndexPair
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor + Index Pair class.
+ *
+ */
+template <typename XprType>
+struct traits<TensorIndexPairOp<XprType>> : public traits<XprType> {
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef Pair<Index, typename XprTraits::Scalar> Scalar;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+};
+
+template <typename XprType>
+struct eval<TensorIndexPairOp<XprType>, Eigen::Dense> {
+  typedef const TensorIndexPairOp<XprType> EIGEN_DEVICE_REF type;
+};
+
+template <typename XprType>
+struct nested<TensorIndexPairOp<XprType>, 1, typename eval<TensorIndexPairOp<XprType>>::type> {
+  typedef TensorIndexPairOp<XprType> type;
+};
+
+}  // end namespace internal
+
+template <typename XprType>
+class TensorIndexPairOp : public TensorBase<TensorIndexPairOp<XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorIndexPairOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename Eigen::internal::nested<TensorIndexPairOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorIndexPairOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorIndexPairOp>::Index Index;
+  typedef Pair<Index, typename XprType::CoeffReturnType> CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexPairOp(const XprType& expr) : m_xpr(expr) {}
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+};
+
+// Eval as rvalue
+template <typename ArgType, typename Device>
+struct TensorEvaluator<const TensorIndexPairOp<ArgType>, Device> {
+  typedef TensorIndexPairOp<ArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  static constexpr int NumDims = internal::array_size<Dimensions>::value;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return CoeffReturnType(index, m_impl.coeff(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ protected:
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+namespace internal {
+
+/** \class TensorPairIndex
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Converts to Tensor<Pair<Index, Scalar>> and reduces to Tensor<Index>.
+ *
+ */
+template <typename ReduceOp, typename Dims, typename XprType>
+struct traits<TensorPairReducerOp<ReduceOp, Dims, XprType>> : public traits<XprType> {
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef Index Scalar;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
+  static constexpr int Layout = XprTraits::Layout;
+};
+
+template <typename ReduceOp, typename Dims, typename XprType>
+struct eval<TensorPairReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense> {
+  typedef const TensorPairReducerOp<ReduceOp, Dims, XprType> EIGEN_DEVICE_REF type;
+};
+
+template <typename ReduceOp, typename Dims, typename XprType>
+struct nested<TensorPairReducerOp<ReduceOp, Dims, XprType>, 1,
+              typename eval<TensorPairReducerOp<ReduceOp, Dims, XprType>>::type> {
+  typedef TensorPairReducerOp<ReduceOp, Dims, XprType> type;
+};
+
+}  // end namespace internal
+
+template <typename ReduceOp, typename Dims, typename XprType>
+class TensorPairReducerOp : public TensorBase<TensorPairReducerOp<ReduceOp, Dims, XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorPairReducerOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename Eigen::internal::nested<TensorPairReducerOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorPairReducerOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorPairReducerOp>::Index Index;
+  typedef Index CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPairReducerOp(const XprType& expr, const ReduceOp& reduce_op,
+                                                            const Index return_dim, const Dims& reduce_dims)
+      : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {}
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+  EIGEN_DEVICE_FUNC const ReduceOp& reduce_op() const { return m_reduce_op; }
+
+  EIGEN_DEVICE_FUNC const Dims& reduce_dims() const { return m_reduce_dims; }
+
+  EIGEN_DEVICE_FUNC Index return_dim() const { return m_return_dim; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const ReduceOp m_reduce_op;
+  const Index m_return_dim;
+  const Dims m_reduce_dims;
+};
+
+// Eval as rvalue
+template <typename ReduceOp, typename Dims, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorPairReducerOp<ReduceOp, Dims, ArgType>, Device> {
+  typedef TensorPairReducerOp<ReduceOp, Dims, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename TensorIndexPairOp<ArgType>::CoeffReturnType PairType;
+  typedef typename TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexPairOp<ArgType>>,
+                                   Device>::Dimensions Dimensions;
+  typedef typename TensorEvaluator<const TensorIndexPairOp<ArgType>, Device>::Dimensions InputDimensions;
+  static constexpr int NumDims = internal::array_size<InputDimensions>::value;
+  typedef array<Index, NumDims> StrideDims;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<PairType, Device> PairStorageMem;
+
+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+  static constexpr int Layout =
+      TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexPairOp<ArgType>>, Device>::Layout;
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_orig_impl(op.expression(), device),
+        m_impl(op.expression().index_pairs().reduce(op.reduce_dims(), op.reduce_op()), device),
+        m_return_dim(op.return_dim()) {
+    gen_strides(m_orig_impl.dimensions(), m_strides);
+    if (Layout == static_cast<int>(ColMajor)) {
+      const Index total_size = internal::array_prod(m_orig_impl.dimensions());
+      m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size;
+    } else {
+      const Index total_size = internal::array_prod(m_orig_impl.dimensions());
+      m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size;
+    }
+    // If m_return_dim is not a valid index, returns 1 or this can crash on Windows.
+    m_stride_div =
+        ((m_return_dim >= 0) && (m_return_dim < static_cast<Index>(m_strides.size()))) ? m_strides[m_return_dim] : 1;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    const PairType v = m_impl.coeff(index);
+    return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double compute_cost =
+        1.0 + (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost<Index>() + TensorOpCost::DivCost<Index>()));
+    return m_orig_impl.costPerCoeff(vectorized) + m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost);
+  }
+
+ private:
+  EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) {
+    if (m_return_dim < 0) {
+      return;  // Won't be using the strides.
+    }
+    eigen_assert(m_return_dim < NumDims && "Asking to convert index to a dimension outside of the rank");
+
+    // Calculate m_stride_div and m_stride_mod, which are used to
+    // calculate the value of an index w.r.t. the m_return_dim.
+    if (Layout == static_cast<int>(ColMajor)) {
+      strides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        strides[i] = strides[i - 1] * dims[i - 1];
+      }
+    } else {
+      strides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        strides[i] = strides[i + 1] * dims[i + 1];
+      }
+    }
+  }
+
+ protected:
+  TensorEvaluator<const TensorIndexPairOp<ArgType>, Device> m_orig_impl;
+  TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexPairOp<ArgType>>, Device> m_impl;
+  const Index m_return_dim;
+  StrideDims m_strides;
+  Index m_stride_mod;
+  Index m_stride_div;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
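The pair/reducer machinery above is what backs TensorBase::argmax() and argmin(); a hypothetical sketch of the user-visible behavior, not part of the patch:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> t(2, 3);
  t.setValues({{1.f, 7.f, 2.f}, {4.f, 0.f, 9.f}});
  // Reduce along dimension 1: for each row, the index of its maximum.
  Eigen::Tensor<Eigen::Index, 1> am = t.argmax(1);
  return (am(0) == 1 && am(1) == 2) ? 0 : 1;
}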
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
new file mode 100644
index 00000000000..8a57576b7ba
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -0,0 +1,213 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
+#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class TensorAssign
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor assignment class.
+ *
+ * This class represents the assignment of the values resulting from the evaluation of
+ * the rhs expression to the memory locations denoted by the lhs expression.
+ */
+namespace internal {
+template <typename LhsXprType, typename RhsXprType>
+struct traits<TensorAssignOp<LhsXprType, RhsXprType> > {
+  typedef typename LhsXprType::Scalar Scalar;
+  typedef typename traits<LhsXprType>::StorageKind StorageKind;
+  typedef
+      typename promote_index_type<typename traits<LhsXprType>::Index, typename traits<RhsXprType>::Index>::type Index;
+  typedef typename LhsXprType::Nested LhsNested;
+  typedef typename RhsXprType::Nested RhsNested;
+  typedef std::remove_reference_t<LhsNested> LhsNested_;
+  typedef std::remove_reference_t<RhsNested> RhsNested_;
+  static constexpr std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions;
+  static constexpr int Layout = internal::traits<LhsXprType>::Layout;
+  typedef typename traits<LhsXprType>::PointerType PointerType;
+
+  enum { Flags = 0 };
+};
+
+template <typename LhsXprType, typename RhsXprType>
+struct eval<TensorAssignOp<LhsXprType, RhsXprType>, Eigen::Dense> {
+  typedef const TensorAssignOp<LhsXprType, RhsXprType>& type;
+};
+
+template <typename LhsXprType, typename RhsXprType>
+struct nested<TensorAssignOp<LhsXprType, RhsXprType>, 1, typename eval<TensorAssignOp<LhsXprType, RhsXprType> >::type> {
+  typedef TensorAssignOp<LhsXprType, RhsXprType> type;
+};
+
+}  // end namespace internal
+
+template <typename LhsXprType, typename RhsXprType>
+class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> > {
+ public:
+  typedef typename Eigen::internal::traits<TensorAssignOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename LhsXprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorAssignOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
+
+  static constexpr int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
+      : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
+
+  /** \returns the nested expressions */
+  EIGEN_DEVICE_FUNC internal::remove_all_t<typename LhsXprType::Nested>& lhsExpression() const {
+    return *((internal::remove_all_t<typename LhsXprType::Nested>*)&m_lhs_xpr);
+  }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename RhsXprType::Nested>& rhsExpression() const {
+    return m_rhs_xpr;
+  }
+
+ protected:
+  internal::remove_all_t<typename LhsXprType::Nested>& m_lhs_xpr;
+  const internal::remove_all_t<typename RhsXprType::Nested>& m_rhs_xpr;
+};
+
+template <typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> {
+  typedef TensorAssignOp<LeftArgType, RightArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static constexpr int NumDims = XprType::NumDims;
+  static constexpr int Layout = TensorEvaluator<LeftArgType, Device>::Layout;
+
+  enum {
+    IsAligned =
+        int(TensorEvaluator<LeftArgType, Device>::IsAligned) & int(TensorEvaluator<RightArgType, Device>::IsAligned),
+    PacketAccess = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) &
+                   int(TensorEvaluator<RightArgType, Device>::PacketAccess),
+    BlockAccess = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) &
+                  int(TensorEvaluator<RightArgType, Device>::BlockAccess),
+    PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) |
+                        int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess),
+    RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock RightTensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType& op, const Device& device)
+      : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
+                         static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+  }
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const {
+    // The dimensions of the lhs and the rhs tensors should be equal to prevent
+    // overflows and ensure the result is fully initialized.
+    // TODO: use left impl instead if right impl dimensions are known at compile time.
+    return m_rightImpl.dimensions();
+  }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+    eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
+    m_leftImpl.evalSubExprsIfNeeded(NULL);
+    // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
+    // null value), attempt to evaluate the rhs expression in place. Returns true iff in place
+    // evaluation isn't supported and the caller still needs to manually assign the values generated
+    // by the rhs to the lhs.
+    return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data());
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
+      m_rightImpl.evalSubExprsIfNeededAsync(m_leftImpl.data(), [done](bool need_assign) { done(need_assign); });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_leftImpl.cleanup();
+    m_rightImpl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const {
+    m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const {
+    const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
+    const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
+    m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
+  }
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_leftImpl.coeff(index); }
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+    return m_leftImpl.template packet<LoadMode>(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // We assume that evalPacket or evalScalar is called to perform the
+    // assignment and account for the cost of the write here, but reduce left
+    // cost by one load because we are using m_leftImpl.coeffRef.
+    TensorOpCost left = m_leftImpl.costPerCoeff(vectorized);
+    return m_rightImpl.costPerCoeff(vectorized) +
+           TensorOpCost(numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)), left.bytes_stored(),
+                        left.compute_cycles()) +
+           TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::merge(m_leftImpl.getResourceRequirements(),
+                                                            m_rightImpl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    if (TensorEvaluator<LeftArgType, Device>::RawAccess && m_leftImpl.data() != NULL) {
+      // If destination has raw data access, we pass it as a potential
+      // destination for a block descriptor evaluation.
+      desc.template AddDestinationBuffer<Layout>(
+          /*dst_base=*/m_leftImpl.data() + desc.offset(),
+          /*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()));
+    }
+
+    RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true);
+    // If block was evaluated into a destination, there is no need to do assignment.
+ if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { + m_leftImpl.writeBlock(desc, block); + } + block.cleanup(); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); } + + private: + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h new file mode 100644 index 00000000000..fc3f3b78a84 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -0,0 +1,1244 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H +#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H + +// clang-format off + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorBase + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor base class. + * + * This class is the common parent of the Tensor and TensorMap class, thus + * making it possible to use either class interchangeably in expressions. + */ +#ifndef EIGEN_PARSED_BY_DOXYGEN +// FIXME Doxygen does not like the inheritance with different template parameters +// Since there is no doxygen documentation inside, we disable it for now +template +class TensorBase +{ + public: + typedef internal::traits DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef std::remove_const_t CoeffReturnType; + static constexpr int NumDimensions = DerivedTraits::NumDimensions; + + // Generic nullary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp + nullaryExpr(const CustomNullaryOp& func) const { + return TensorCwiseNullaryOp(derived(), func); + } + + // Coefficient-wise nullary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + constant(const Scalar& value) const { + return nullaryExpr(internal::scalar_constant_op(value)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + random() const { + return nullaryExpr(internal::UniformRandomGenerator()); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp + random(const RandomGenerator& gen = RandomGenerator()) const { + return nullaryExpr(gen); + } + + // Tensor generation + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorGeneratorOp + generate(const Generator& generator) const { + return TensorGeneratorOp(derived(), generator); + } + + // Generic unary operation support. 
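+ // Editor's note: a minimal usage sketch for the nullary and generic unary
+ // hooks in this section (names like `t` are illustrative, not part of this
+ // header):
+ //
+ //   Eigen::Tensor<float, 2> t(3, 4);
+ //   t.setRandom();
+ //   auto ones = t.constant(1.0f);                  // nullary expression
+ //   auto doubled = t.unaryExpr([](float x) { return 2.0f * x; });
+ //   Eigen::Tensor<float, 2> r = ones + doubled;    // forces evaluation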
+ template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp + unaryExpr(const CustomUnaryOp& func) const { + return TensorCwiseUnaryOp(derived(), func); + } + + // Coefficient-wise unary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator-() const { + return unaryExpr(internal::scalar_opposite_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sqrt() const { + return unaryExpr(internal::scalar_sqrt_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sign() const { + return unaryExpr(internal::scalar_sign_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + rsqrt() const { + return unaryExpr(internal::scalar_rsqrt_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + square() const { + return unaryExpr(internal::scalar_square_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + cube() const { + return unaryExpr(internal::scalar_cube_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + inverse() const { + return unaryExpr(internal::scalar_inverse_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + tanh() const { + return unaryExpr(internal::scalar_tanh_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + lgamma() const { + return unaryExpr(internal::scalar_lgamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + digamma() const { + return unaryExpr(internal::scalar_digamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i0() const { + return unaryExpr(internal::scalar_bessel_i0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i0e() const { + return unaryExpr(internal::scalar_bessel_i0e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i1() const { + return unaryExpr(internal::scalar_bessel_i1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i1e() const { + return unaryExpr(internal::scalar_bessel_i1e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_j0() const { + return unaryExpr(internal::scalar_bessel_j0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_y0() const { + return unaryExpr(internal::scalar_bessel_y0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_j1() const { + return unaryExpr(internal::scalar_bessel_j1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_y1() const { + return unaryExpr(internal::scalar_bessel_y1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k0() const { + return unaryExpr(internal::scalar_bessel_k0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k0e() const { + return unaryExpr(internal::scalar_bessel_k0e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k1() const { + return 
unaryExpr(internal::scalar_bessel_k1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k1e() const { + return unaryExpr(internal::scalar_bessel_k1e_op()); + } + + // igamma(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igamma(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igamma_op()); + } + + // igamma_der_a(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igamma_der_a(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igamma_der_a_op()); + } + + // gamma_sample_der_alpha(alpha = this, sample = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + gamma_sample_der_alpha(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_gamma_sample_der_alpha_op()); + } + + // igammac(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igammac(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igammac_op()); + } + + // zeta(x = this, q = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + zeta(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_zeta_op()); + } + + // polygamma(n = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + polygamma(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_polygamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + erf() const { + return unaryExpr(internal::scalar_erf_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + erfc() const { + return unaryExpr(internal::scalar_erfc_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + ndtri() const { + return unaryExpr(internal::scalar_ndtri_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sigmoid() const { + return unaryExpr(internal::scalar_logistic_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + exp() const { + return unaryExpr(internal::scalar_exp_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + expm1() const { + return unaryExpr(internal::scalar_expm1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log() const { + return unaryExpr(internal::scalar_log_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log1p() const { + return unaryExpr(internal::scalar_log1p_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log2() const { + return unaryExpr(internal::scalar_log2_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + abs() const { + return unaryExpr(internal::scalar_abs_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + arg() const { + return 
unaryExpr(internal::scalar_arg_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + clip(Scalar min, Scalar max) const { + return unaryExpr(internal::scalar_clamp_op(min, max)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const std::conditional_t::IsComplex, + TensorCwiseUnaryOp, const Derived>, + Derived> + conjugate() const { + return choose(Cond::IsComplex>(), unaryExpr(internal::scalar_conjugate_op()), derived()); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::enable_if_t::Real>::value, + TensorCwiseUnaryOp, const Derived>> + pow(ScalarExponent exponent) const + { + return unaryExpr(internal::scalar_unary_pow_op(exponent)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + real() const { + return unaryExpr(internal::scalar_real_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + imag() const { + return unaryExpr(internal::scalar_imag_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator+ (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator+ (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator- (Scalar rhs) const { + EIGEN_STATIC_ASSERT((NumTraits::IsSigned || internal::is_same >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator- (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator* (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator* (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator/ (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator/ (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator% (Scalar rhs) const { + EIGEN_STATIC_ASSERT(NumTraits::IsInteger, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD); + return unaryExpr(internal::scalar_mod_op(rhs)); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + cwiseMax(Scalar threshold) const { + return cwiseMax(constant(threshold)); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + cwiseMin(Scalar threshold) const { + return cwiseMin(constant(threshold)); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const std::conditional_t::value, + Derived, + TensorConversionOp > + cast() const { + return choose(Cond::value>(), derived(), TensorConversionOp(derived())); + } + + 
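+ // Editor's note: a hedged sketch of the scalar-argument operators and cast()
+ // above (tensor `a` is illustrative):
+ //
+ //   Eigen::Tensor<int, 1> a(5);
+ //   a.setConstant(7);
+ //   auto shifted = a + 1;             // scalar sum via bind2nd_op
+ //   auto clipped = a.clip(0, 5);      // clamp coefficients to [0, 5]
+ //   auto as_float = a.cast<float>();  // TensorConversionOp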
EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + round() const { + return unaryExpr(internal::scalar_round_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + rint() const { + return unaryExpr(internal::scalar_rint_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + ceil() const { + return unaryExpr(internal::scalar_ceil_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + floor() const { + return unaryExpr(internal::scalar_floor_op()); + } + + // Generic binary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp + binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const { + return TensorCwiseBinaryOp(derived(), other, func); + } + + // Coefficient-wise binary operators. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator+(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_sum_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator-(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_difference_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator*(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_product_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator/(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_quotient_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + cwiseMax(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_max_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + cwiseMin(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_min_op()); + } + + // logical operators + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator&&(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_and_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator||(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator&(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_bitwise_and_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator|(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_bitwise_or_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator^(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_bitwise_xor_op()); + } + + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE + const TensorCwiseUnaryOp, const Derived> + operator!() const { + return unaryExpr(internal::scalar_boolean_not_op()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseUnaryOp, const Derived> + operator~() const { + return unaryExpr(internal::scalar_bitwise_not_op()); + } + + // Comparisons and tests. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator<(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator<=(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator>(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator>=(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator==(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator!=(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + // comparisons and tests for Scalars + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator<(Scalar threshold) const { + return operator<(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator<=(Scalar threshold) const { + return operator<=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator>(Scalar threshold) const { + return operator>(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator>=(Scalar threshold) const { + return operator>=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator==(Scalar threshold) const { + return operator==(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator!=(Scalar threshold) const { + return operator!=(constant(threshold)); + } + + // Predicates. 
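+ // Editor's note: the comparisons above yield boolean tensor expressions that
+ // are typically consumed by select() (defined further below); a sketch with
+ // illustrative names:
+ //
+ //   Eigen::Tensor<float, 2> x(2, 2), zeros(2, 2);
+ //   x.setRandom();
+ //   zeros.setZero();
+ //   Eigen::Tensor<float, 2> relu = (x > 0.0f).select(x, zeros);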
+ EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorConversionOp, const Derived>> + (isnan)() const { + return unaryExpr(internal::scalar_isnan_op()).template cast(); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorConversionOp, const Derived>> + (isinf)() const { + return unaryExpr(internal::scalar_isinf_op()).template cast(); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorConversionOp, const Derived>> + (isfinite)() const { + return unaryExpr(internal::scalar_isfinite_op()).template cast(); + } + + // Coefficient-wise ternary operators. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSelectOp + select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { + return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); + } + + // Contractions. + typedef Eigen::IndexPair DimensionPair; + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp + contract(const OtherDerived& other, const Dimensions& dims) const { + return TensorContractionOp(derived(), other.derived(), dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp + contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& output_kernel) const { + return TensorContractionOp(derived(), other.derived(), dims, output_kernel); + } + + // Convolutions. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConvolutionOp + convolve(const KernelDerived& kernel, const Dimensions& dims) const { + return TensorConvolutionOp(derived(), kernel.derived(), dims); + } + + // Fourier transforms + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorFFTOp + fft(const FFT& dims) const { + return TensorFFTOp(derived(), dims); + } + + // Scan. + typedef TensorScanOp, const Derived> TensorScanSumOp; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanSumOp + cumsum(const Index& axis, bool exclusive = false) const { + return TensorScanSumOp(derived(), axis, exclusive); + } + + typedef TensorScanOp, const Derived> TensorScanProdOp; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanProdOp + cumprod(const Index& axis, bool exclusive = false) const { + return TensorScanProdOp(derived(), axis, exclusive); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanOp + scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const { + return TensorScanOp(derived(), axis, exclusive, reducer); + } + + // Reductions. 
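+ // Editor's note: a reduction sketch, assuming a rank-3 float tensor (the
+ // reduced dimension index is illustrative):
+ //
+ //   Eigen::Tensor<float, 3> t(4, 3, 2);
+ //   t.setRandom();
+ //   Eigen::array<Eigen::Index, 1> dims = {1};  // reduce the middle dimension
+ //   Eigen::Tensor<float, 2> s = t.sum(dims);   // result is 4x2
+ //   Eigen::Tensor<float, 0> total = t.sum();   // reduce over all dimensions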
+ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + sum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::SumReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + sum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::SumReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + mean(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MeanReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + mean() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MeanReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + prod(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::ProdReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + prod() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::ProdReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + maximum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); + } + + template + const TensorReductionOp, const DimensionList, const Derived> + maximum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MaxReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + minimum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); + } + + template + const TensorReductionOp, const DimensionList, const Derived> + minimum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MinReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp::value, Derived, TensorConversionOp > > + all(const Dims& dims) const { + return cast().reduce(dims, internal::AndReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const std::conditional_t::value, Derived, TensorConversionOp > > + all() const { + DimensionList in_dims; + return cast().reduce(in_dims, internal::AndReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp::value, Derived, TensorConversionOp > > + any(const Dims& dims) const { + return cast().reduce(dims, internal::OrReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const std::conditional_t::value, Derived, TensorConversionOp > > + any() const { + DimensionList in_dims; + return cast().reduce(in_dims, internal::OrReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPairReducerOp< + internal::ArgMaxPairReducer >, + const array, const Derived> + argmax() const { + array in_dims; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; + return TensorPairReducerOp< + internal::ArgMaxPairReducer >, + const array, + const Derived>(derived(), 
internal::ArgMaxPairReducer >(), -1, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPairReducerOp< + internal::ArgMinPairReducer >, + const array, const Derived> + argmin() const { + array in_dims; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; + return TensorPairReducerOp< + internal::ArgMinPairReducer >, + const array, + const Derived>(derived(), internal::ArgMinPairReducer >(), -1, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPairReducerOp< + internal::ArgMaxPairReducer >, + const array, const Derived> + argmax(const Index return_dim) const { + array in_dims; + in_dims[0] = return_dim; + return TensorPairReducerOp< + internal::ArgMaxPairReducer >, + const array, + const Derived>(derived(), internal::ArgMaxPairReducer >(), return_dim, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPairReducerOp< + internal::ArgMinPairReducer >, + const array, const Derived> + argmin(const Index return_dim) const { + array in_dims; + in_dims[0] = return_dim; + return TensorPairReducerOp< + internal::ArgMinPairReducer >, + const array, + const Derived>(derived(), internal::ArgMinPairReducer >(), return_dim, in_dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp + reduce(const Dims& dims, const Reducer& reducer) const { + return TensorReductionOp(derived(), dims, reducer); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTraceOp + trace(const Dims& dims) const { + return TensorTraceOp(derived(), dims); + } + + const TensorTraceOp, const Derived> + trace() const { + DimensionList in_dims; + return TensorTraceOp, const Derived>(derived(), in_dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorBroadcastingOp + broadcast(const Broadcast& bcast) const { + return TensorBroadcastingOp(derived(), bcast); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp + concatenate(const OtherDerived& other, Axis axis) const { + return TensorConcatenationOp(derived(), other.derived(), axis); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPatchOp + extract_patches(const PatchDims& patch_dims) const { + return TensorPatchOp(derived(), patch_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index patch_rows = 1, const Index patch_cols = 1, + const Index row_stride = 1, const Index col_stride = 1, + const Index in_row_stride = 1, const Index in_col_stride = 1, + const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, 1, 1, padding_type, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride, + const Index row_inflate_stride, const Index col_inflate_stride, + const Index padding_top, const Index padding_bottom, + const Index padding_left,const Index padding_right, + const Scalar padding_value) const { + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride, + padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const 
TensorVolumePatchOp + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1, + const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { + return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value); + } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorVolumePatchOp + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride, const Index row_stride, const Index col_stride, + const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride, + const Index padding_top_z, const Index padding_bottom_z, + const Index padding_top, const Index padding_bottom, + const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const { + return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + + // Morphing operators. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp + swap_layout() const { + return TensorLayoutSwapOp(derived()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp(derived(), newDimensions); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset) const { + EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range) + return TensorChippingOp(derived(), offset, DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorRollOp + roll(const Rolls& rolls) const { + return TensorRollOp(derived(), rolls); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp + pad(const PaddingDimensions& padding) const { + return TensorPaddingOp(derived(), padding, internal::scalar_cast_op()(0)); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp + pad(const PaddingDimensions& padding, const Scalar padding_value) const { + return TensorPaddingOp(derived(), padding, padding_value); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp + shuffle(const Shuffle& shfl) const { + return TensorShufflingOp(derived(), shfl); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const 
TensorStridingOp + stride(const Strides& strides) const { + return TensorStridingOp(derived(), strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorInflationOp + inflate(const Strides& strides) const { + return TensorInflationOp(derived(), strides); + } + + // Returns a tensor containing index/value pairs + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorIndexPairOp + index_pairs() const { + return TensorIndexPairOp(derived()); + } + + // Support for custom unary and binary operations + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomUnaryOp customOp(const CustomUnaryFunc& op) const { + return TensorCustomUnaryOp(derived(), op); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomBinaryOp customOp(const OtherDerived& other, const CustomBinaryFunc& op) const { + return TensorCustomBinaryOp(derived(), other, op); + } + + // Force the evaluation of the expression. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorForcedEvalOp eval() const { + return TensorForcedEvalOp(derived()); + } + + // Returns a formatted tensor ready for printing to a stream + template + inline const TensorWithFormat format(const Format& fmt) const { + return TensorWithFormat(derived(), fmt); + } + + #ifdef EIGEN_READONLY_TENSORBASE_PLUGIN + #include EIGEN_READONLY_TENSORBASE_PLUGIN + #endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } + + protected: + template friend class Tensor; + template friend class TensorFixedSize; + // the Eigen:: prefix is required to work around a compilation issue with nvcc 9.0 + template friend class Eigen::TensorBase; +}; + +template::value> +class TensorBase : public TensorBase { + public: + typedef TensorBase Base; + typedef internal::traits DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef Scalar CoeffReturnType; + static constexpr int NumDimensions = DerivedTraits::NumDimensions; + + template friend class Tensor; + template friend class TensorFixedSize; + // the Eigen:: prefix is required to work around a compilation issue with nvcc 9.0 + template friend class Eigen::TensorBase; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setZero() { + return setConstant(Scalar(0)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { + return derived() = this->constant(val); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->random(); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->template random(); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setValues( + const typename internal::Initializer::InitList& vals) { + TensorEvaluator eval(derived(), DefaultDevice()); + internal::initialize_tensor(eval, vals); + return derived(); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const OtherDerived& other) { + return derived() = derived() + other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const OtherDerived& other) { + return derived() = derived() - other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator*=(const OtherDerived& other) { + return derived() = derived() * other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator/=(const OtherDerived& other) { + return derived() = derived() / other.derived(); + } + 
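+ // Editor's note: a sketch of the writable API above (in-place initialization
+ // and compound assignment; names are illustrative):
+ //
+ //   Eigen::Tensor<float, 2> a(2, 3), b(2, 3);
+ //   a.setValues({{1, 2, 3}, {4, 5, 6}});
+ //   b.setConstant(2.0f);
+ //   a += b;   // evaluates a = a + b through TensorAssignOp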
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp + swap_layout() const { + return TensorLayoutSwapOp(derived()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorLayoutSwapOp + swap_layout() { + return TensorLayoutSwapOp(derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp + concatenate(const OtherDerived& other, const Axis& axis) const { + return TensorConcatenationOp(derived(), other, axis); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorConcatenationOp + concatenate(const OtherDerived& other, const Axis& axis) { + return TensorConcatenationOp(derived(), other, axis); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp(derived(), newDimensions); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReshapingOp + reshape(const NewDimensions& newDimensions) { + return TensorReshapingOp(derived(), newDimensions); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) { + return TensorSlicingOp(derived(), startIndices, sizes); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset) const { + EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range) + return TensorChippingOp(derived(), offset, DimId); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset) { + EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range) + return TensorChippingOp(derived(), offset, DimId); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset, const Index dim) { + return TensorChippingOp(derived(), offset, dim); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReverseOp + reverse(const ReverseDimensions& rev) { + return TensorReverseOp(derived(), rev); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorRollOp + roll(const Rolls& roll) const { + return TensorRollOp(derived(), roll); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorRollOp + roll(const Rolls& roll) { + return TensorRollOp(derived(), roll); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp + shuffle(const Shuffle& shfl) const { + return 
TensorShufflingOp(derived(), shfl); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorShufflingOp + shuffle(const Shuffle& shfl) { + return TensorShufflingOp(derived(), shfl); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp + stride(const Strides& strides) const { + return TensorStridingOp(derived(), strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingOp + stride(const Strides& strides) { + return TensorStridingOp(derived(), strides); + } + + // Select the device on which to evaluate the expression. + template + TensorDevice device(const DeviceType& dev) { + return TensorDevice(dev, derived()); + } + + // Select the async device on which to evaluate the expression. + template + TensorAsyncDevice device(const DeviceType& dev, DoneCallback done) { + return TensorAsyncDevice(dev, derived(), std::move(done)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } + + #ifdef EIGEN_TENSORBASE_PLUGIN + #include EIGEN_TENSORBASE_PLUGIN + #endif + + protected: + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TensorBase) + EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorBase) + + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(derived(), other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + return derived(); + } +}; +#endif // EIGEN_PARSED_BY_DOXYGEN +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h new file mode 100644 index 00000000000..0b068a7c99e --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -0,0 +1,1474 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H +#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +// -------------------------------------------------------------------------- // +// Forward declarations for templates defined below. +template +class TensorBlockIO; + +// -------------------------------------------------------------------------- // +// Helper function to compute strides for densely stored buffer of given +// dimensions. + +// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use +// this function instead everywhere. +template +EIGEN_ALWAYS_INLINE DSizes strides(const DSizes& dimensions) { + DSizes strides; + if (NumDims == 0) return strides; + + // TODO(ezhulenev): Use templates to unroll this loop (similar to + // h_array_reduce in CXX11meta.h)? Benchmark it. 
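+ // Editor's note: for a col-major buffer with dimensions {2, 3, 4} the
+ // col-major branch below yields strides {1, 2, 6} (innermost dimension
+ // contiguous); the row-major branch yields {12, 4, 1}.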
+ if (static_cast(Layout) == static_cast(ColMajor)) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i - 1] * dimensions[i - 1]; + } + } else { + strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dimensions[i + 1]; + } + } + + return strides; +} + +template +EIGEN_ALWAYS_INLINE DSizes strides(const Eigen::array& dimensions) { + return strides(DSizes(dimensions)); +} + +template +EIGEN_STRONG_INLINE DSizes strides(const Sizes& sizes) { + return strides(DSizes(sizes)); +} + +// -------------------------------------------------------------------------- // + +// Tensor block shape type defines the preferred shape for the blocks +// extracted from the larger tensor. +// +// Example: blocks of 100 elements from the large 100x100 tensor: +// - tensor: 100x100 +// - target_block_size: 100 +// +// TensorBlockShapeType: +// - kUniformAllDims: 100 blocks of size 10x10 +// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column +// or row major layout) +enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims }; + +struct TensorBlockResourceRequirements { + TensorBlockShapeType shape_type; // target block shape + size_t size; // target block size + TensorOpCost cost_per_coeff; // cost of computing a single block element + +#ifdef EIGEN_HIPCC + // For HIPCC, we need to explicitly declare the constructor, which is + // implicitly invoked in the "merge" / "any" routines, as a "device function"; + // otherwise HIPCC errors out complaining about the lack of a matching constructor + EIGEN_DEVICE_FUNC TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_, TensorOpCost cost_) + : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) {} +#endif + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type, + size_t size_in_bytes, TensorOpCost cost) { + const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar)); + return {shape_type, size, cost}; + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type, + size_t size_in_bytes) { + // This default cost per coefficient is valid for most materialized tensor + // block evaluation implementations, because they typically just read + // coefficients from the underlying tensor storage, and write to the tensor + // block buffer (scratch or destination memory; reads and writes have a linear + // access pattern). We ignore the fixed cost of block evaluation, because in + // practice it should be negligible. + // + // Lazy block evaluation adds the cost of calling a functor for each + // coefficient. + // + // All non-trivial block evaluation implementations must provide their own + // cost approximation (e.g. shuffling inner dimension has a much higher cost + // because it reads memory randomly, although the total number of moved + // bytes is the same). 
+ return withShapeAndSize(shape_type, size_in_bytes, + {/*bytes_loaded=*/sizeof(Scalar), + /*bytes_stored=*/sizeof(Scalar), + /*compute_cycles=*/0}); + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(size_t size_in_bytes) { + return withShapeAndSize(TensorBlockShapeType::kSkewedInnerDims, size_in_bytes); + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(size_t size_in_bytes) { + return withShapeAndSize(TensorBlockShapeType::kUniformAllDims, size_in_bytes); + } + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements + merge(const TensorBlockResourceRequirements& lhs, const TensorBlockResourceRequirements& rhs) { + return {merge(lhs.shape_type, rhs.shape_type), // shape_type + merge(lhs.size, rhs.size), // size + merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff + } + + EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(TensorOpCost cost) { + cost_per_coeff += cost; + return *this; + } + + // This is a resource requirement that should be returned from expressions + // that do not have any block evaluation preference (e.g. default tensor + // expression with raw buffer access). + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() { + return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}}; + } + + private: + using Requirements = TensorBlockResourceRequirements; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) { + return numext::maxi(lhs_size, rhs_size); + } + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockShapeType merge(TensorBlockShapeType lhs, + TensorBlockShapeType rhs) { + return (lhs == TensorBlockShapeType::kSkewedInnerDims || rhs == TensorBlockShapeType::kSkewedInnerDims) + ? TensorBlockShapeType::kSkewedInnerDims + : TensorBlockShapeType::kUniformAllDims; + } + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, TensorOpCost rhs_cost) { + return lhs_cost + rhs_cost; + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockDescriptor specifies a block offset within a tensor and the block +// sizes along each of the tensor dimensions. + +template +class TensorBlockDescriptor { + public: + typedef DSizes Dimensions; + + // If we evaluate a Tensor assignment and the expression on the left already + // has a memory buffer, then we can apply a performance optimization and + // evaluate the root expression directly into the final output memory. + // Sometimes it's possible to reuse that buffer for materializing + // subexpressions inside an expression tree, to avoid dynamic memory + // allocation. + // + // The pointer type of the underlying storage is erased, because passing the + // Scalar type through all the expression evaluation layers would require way + // too many templates. In practice the destination buffer type should always + // match the evaluated expression's scalar type. + class DestinationBuffer { + public: + enum DestinationBufferKind : int { + // The above explicit specification of "int" as the enum basetype is + // needed to get around a HIPCC link error ("the field type is not + // amp-compatible") + // which is issued for class members with the enum type. + // TODO(rocm): + // remove the "int" basetype once HIPCC has been fixed to not error out + // in the above scenario. + + // Destination buffer is not defined (`m_data` == nullptr). 
+ kEmpty, + + // Tensor block defined by an owning tensor block descriptor can fit + // contiguously into the destination buffer. In this case it's safe to + // materialize tensor block in the destination buffer, wrap it in a + // TensorMap, and use to build Eigen expression on top of it. + kContiguous, + + // Destination buffer strides do not match strides of the contiguously + // stored block, and it's impossible to define a TensorMap over this + // buffer. However if we are evaluating a root of an expression tree, we + // still can materialize an output into this destination, because we can + // guarantee that no one will ever access it through block API. + // + // In theory it is possible to build valid TensorStriding + // expression on top of this destination buffer, however it has + // inefficient coeff/packet access, and defeats the purpose of fast block + // evaluation API. + kStrided + }; + + template + Scalar* data() const { + eigen_assert(m_data_type_size == sizeof(Scalar)); + return static_cast(m_data); + } + + const Dimensions& strides() const { return m_strides; } + const DestinationBufferKind& kind() const { return m_kind; } + + private: + friend class TensorBlockDescriptor; + + DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {} + + template + DestinationBuffer(Scalar* data, const Dimensions& strides, DestinationBufferKind kind) + : m_data(static_cast(data)), m_data_type_size(sizeof(Scalar)), m_strides(strides), m_kind(kind) {} + + template + static DestinationBuffer make(const TensorBlockDescriptor& desc, Scalar* data, const Dimensions& strides) { + return DestinationBuffer(data, strides, kind(desc, strides)); + } + + template + static DestinationBufferKind kind(const TensorBlockDescriptor& desc, const Dimensions& strides) { + const Dimensions& desc_dims = desc.dimensions(); + const Dimensions& desc_strides = internal::strides(desc_dims); + for (int i = 0; i < NumDims; ++i) { + if (desc_dims[i] == 1) continue; + if (desc_strides[i] != strides[i]) return kStrided; + } + return kContiguous; + } + + // Storage pointer is type erased, to reduce template bloat, but we still + // keep the size of the underlying element type for error checking. + void* m_data; + size_t m_data_type_size; + + // Destination buffer dimensions always match the dimensions of a tensor + // block descriptor it belongs to, however strides might be different. + Dimensions m_strides; + + DestinationBufferKind m_kind; + }; + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, const DestinationBuffer& destination) + : m_offset(offset), m_dimensions(dimensions), m_destination(destination) {} + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions) + : m_offset(offset), m_dimensions(dimensions), m_destination(DestinationBuffer()) {} + + IndexType offset() const { return m_offset; } + const Dimensions& dimensions() const { return m_dimensions; } + IndexType dimension(int index) const { return m_dimensions[index]; } + IndexType size() const { return array_prod(m_dimensions); } + + const DestinationBuffer& destination() const { return m_destination; } + + template + void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) { + eigen_assert(dst_base != NULL); + m_destination = DestinationBuffer::template make(*this, dst_base, dst_strides); + } + + template + void AddDestinationBuffer(Scalar* dst_base, const DSizes& dst_strides) { + // DSizes constructor will do index type promotion if it's safe. 
+ AddDestinationBuffer(dst_base, Dimensions(dst_strides)); + } + + TensorBlockDescriptor& DropDestinationBuffer() { + m_destination.m_data = NULL; + m_destination.m_kind = DestinationBuffer::kEmpty; + return *this; + } + + bool HasDestinationBuffer() const { return m_destination.kind() != DestinationBuffer::kEmpty; } + + // Returns a copy of `*this` with updated offset. + TensorBlockDescriptor WithOffset(IndexType offset) const { + return TensorBlockDescriptor(offset, m_dimensions, m_destination); + } + + private: + // Offset and dimensions are immutable after construction. Block descriptor + // can only be mutated by adding or dropping destination. + const IndexType m_offset; + const Dimensions m_dimensions; + DestinationBuffer m_destination; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockMapper is responsible for iterating over the blocks of a tensor. + +template +class TensorBlockMapper { + typedef TensorBlockDescriptor BlockDescriptor; + + public: + typedef DSizes Dimensions; + + TensorBlockMapper() = default; + TensorBlockMapper(const DSizes& dimensions, const TensorBlockResourceRequirements& requirements) + : m_tensor_dimensions(dimensions), m_requirements(requirements) { + // Compute block dimensions and the total number of blocks. + InitializeBlockDimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { return m_total_block_count; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { return m_block_dimensions.TotalSize(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& blockDimensions() const { + return m_block_dimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor blockDescriptor(IndexType block_index) const { + static const bool isColMajor = Layout == static_cast(ColMajor); + + IndexType offset = 0; + DSizes dimensions; + + if (NumDims == 0) return BlockDescriptor(offset, dimensions); + + // Iterate outer -> inner dimensions. + for (int i = NumDims - 1; i >= 0; --i) { + const int dim = isColMajor ? i : NumDims - i - 1; + + const IndexType idx = block_index / m_block_strides[dim]; + block_index -= idx * m_block_strides[dim]; + + const IndexType coord = idx * m_block_dimensions[dim]; + dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, m_block_dimensions[dim]); + offset += coord * m_tensor_strides[dim]; + } + + return {offset, dimensions}; + } + + private: + void InitializeBlockDimensions() { + // Requested block shape and size. + const TensorBlockShapeType shape_type = m_requirements.shape_type; + IndexType target_block_size = numext::maxi(1, static_cast(m_requirements.size)); + + IndexType tensor_size = m_tensor_dimensions.TotalSize(); + + // Corner case: one of the dimensions is zero. Logic below is too complex + // to handle this case on a general basis, just use unit block size. + // Note: we must not yield blocks with zero dimensions (recipe for + // overflows/underflows, divisions by zero and NaNs later). + if (tensor_size == 0) { + for (int i = 0; i < NumDims; ++i) { + m_block_dimensions[i] = 1; + } + m_total_block_count = 0; + return; + } + + // If tensor fits into a target block size, evaluate it as a single block. + if (tensor_size <= target_block_size) { + m_block_dimensions = m_tensor_dimensions; + m_total_block_count = 1; + // The only valid block index is `0`, and in this case we do not need + // to compute real strides for tensor or blocks (see blockDescriptor). 
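+ // Editor's note: e.g. a 10x10 tensor with a target block size of 100
+ // coefficients falls into this single-block case, so the strides set below
+ // are never used for real offset arithmetic.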
+ for (int i = 0; i < NumDims; ++i) { + m_tensor_strides[i] = 0; + m_block_strides[i] = 1; + } + return; + } + + static const bool isColMajor = Layout == static_cast(ColMajor); + + // Block shape skewed towards inner dimension. + if (shape_type == TensorBlockShapeType::kSkewedInnerDims) { + IndexType coeff_to_allocate = target_block_size; + + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - i - 1; + m_block_dimensions[dim] = numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]); + coeff_to_allocate = + numext::div_ceil(coeff_to_allocate, numext::maxi(static_cast(1), m_block_dimensions[dim])); + } + eigen_assert(coeff_to_allocate == 1); + + } else if (shape_type == TensorBlockShapeType::kUniformAllDims) { + // Tensor will not fit within 'target_block_size' budget: calculate tensor + // block dimension sizes based on "square" dimension size target. + const IndexType dim_size_target = convert_index( + std::pow(static_cast(target_block_size), 1.0f / static_cast(m_block_dimensions.rank()))); + + for (int i = 0; i < NumDims; ++i) { + // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it + // a multiple of the packet size. Note that reducing + // 'block_dim_size' in this manner can increase the number of + // blocks, and so will amplify any per-block overhead. + m_block_dimensions[i] = numext::mini(dim_size_target, m_tensor_dimensions[i]); + } + + // Add any un-allocated coefficients to inner dimension(s). + IndexType total_size = m_block_dimensions.TotalSize(); + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - i - 1; + + if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) { + const IndexType total_size_other_dims = total_size / m_block_dimensions[dim]; + const IndexType alloc_avail = numext::div_ceil(target_block_size, total_size_other_dims); + if (alloc_avail == m_block_dimensions[dim]) { + // Insufficient excess coefficients to allocate. + break; + } + m_block_dimensions[dim] = numext::mini(m_tensor_dimensions[dim], alloc_avail); + total_size = total_size_other_dims * m_block_dimensions[dim]; + } + } + + } else { + eigen_assert(false); // unknown block shape + } + + eigen_assert(m_block_dimensions.TotalSize() >= + numext::mini(target_block_size, m_tensor_dimensions.TotalSize())); + + // Calculate block counts by dimension and total block count. + DSizes block_count; + for (int i = 0; i < NumDims; ++i) { + block_count[i] = numext::div_ceil(m_tensor_dimensions[i], m_block_dimensions[i]); + } + m_total_block_count = array_prod(block_count); + + // Calculate block strides (used for enumerating blocks). + m_tensor_strides = strides(m_tensor_dimensions); + m_block_strides = strides(block_count); + } + + DSizes m_tensor_dimensions; + TensorBlockResourceRequirements m_requirements; + + DSizes m_block_dimensions; + IndexType m_total_block_count; + + DSizes m_tensor_strides; + DSizes m_block_strides; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockScratchAllocator is responsible for allocating temporary buffers +// for block evaluation (output or input block materialization). Given that +// Eigen expression traversal order is deterministic, all temporary allocations +// are happening in the same order, and usually have exactly the same size. +// Scratch allocator keeps a trace of all dynamic allocations, and after the +// first block evaluation is completed, we should be able to reuse all the +// temporary buffers for the next block evaluation. 
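+//
+// A minimal usage sketch (illustrative only; `device`, `mapper` and
+// `block_bytes` are hypothetical names, not part of this header):
+//
+//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
+//   for (IndexType block = 0; block < mapper.blockCount(); ++block) {
+//     void* buf = scratch.allocate(block_bytes);
+//     // ... materialize block into `buf` ...
+//     scratch.reset();  // rewind so the next block reuses this allocation
+//   }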
+
+template <typename Device>
+class TensorBlockScratchAllocator {
+ public:
+  explicit TensorBlockScratchAllocator(const Device& device) : m_device(device), m_allocation_index(0) {}
+
+  ~TensorBlockScratchAllocator() {
+    for (size_t i = 0; i < m_allocations.size(); ++i) {
+      m_device.deallocate(m_allocations[i].ptr);
+    }
+  }
+
+  void* allocate(size_t size) {
+    // TODO(ezhulenev): Remove when replaced with inlined vector.
+    if (m_allocations.capacity() == 0) m_allocations.reserve(8);
+
+    // Check if we already have an existing allocation at the current index.
+    const int num_allocations = static_cast<int>(m_allocations.size());
+    const bool has_allocation = m_allocation_index < num_allocations;
+
+    // Allocation index can't be larger than the number of allocations.
+    eigen_assert(m_allocation_index <= num_allocations);
+
+    // If we have an existing allocation, and its size is larger than or equal
+    // to the requested size, we do nothing.
+
+    // If the current allocation can't fit the requested size, we deallocate it
+    // and replace it with a larger allocation.
+    if (has_allocation && m_allocations[m_allocation_index].size < size) {
+      m_device.deallocate(m_allocations[m_allocation_index].ptr);
+      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
+      m_allocations[m_allocation_index].size = size;
+    }
+
+    // Make a new allocation if we don't have an existing one.
+    if (!has_allocation) {
+      Allocation allocation;
+      allocation.ptr = m_device.allocate(size);
+      allocation.size = size;
+      m_allocations.push_back(allocation);
+    }
+
+    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
+    eigen_assert(m_allocations[m_allocation_index].size >= size);
+
+    return m_allocations[m_allocation_index++].ptr;
+  }
+
+  void reset() { m_allocation_index = 0; }
+
+ private:
+  struct Allocation {
+    void* ptr;
+    size_t size;
+  };
+
+  const Device& m_device;
+  int m_allocation_index;
+  // TODO(ezhulenev): This should be an inlined vector.
+  std::vector<Allocation> m_allocations;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockKind represents all possible block kinds that can be produced by
+// the TensorEvaluator::evalBlock function.
+enum TensorBlockKind {
+  // Tensor block that is a lazy expression that must be assigned to a
+  // destination using TensorBlockAssign.
+  kExpr,
+
+  // Tensor block that is a view into a memory buffer owned by an underlying
+  // Tensor expression (e.g. it can be a view into a Tensor buffer).
+  kView,
+
+  // Tensor block that was materialized in a scratch memory buffer, allocated
+  // with TensorBlockScratchAllocator. This block must be copied to a
+  // destination, similar to a block of `kExpr` type.
+  kMaterializedInScratch,
+
+  // Tensor block that was materialized directly into the final output memory
+  // buffer. For example if the left side of an assignment is a Tensor, we can
+  // directly materialize the block in the destination memory.
+  //
+  // If strides in the output buffer do not match tensor block strides, the
+  // Tensor expression will be invalid, and should not be used by
+  // TensorBlockAssign or for constructing another block expression.
+  kMaterializedInOutput
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockNotImplemented should be used to define the TensorBlock typedef
+// in TensorEvaluators that do not support block evaluation.
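+//
+// For example, an evaluator that cannot produce blocks would typically declare:
+//
+//   typedef internal::TensorBlockNotImplemented TensorBlock;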
+ +class TensorBlockNotImplemented { + public: + typedef void XprType; +}; + +// -------------------------------------------------------------------------- // +// XprScalar extracts Scalar type from the Eigen expressions (if expression type +// is not void). It's required to be able to define lazy block expression for +// argument types, that do not support block evaluation. + +template +struct XprScalar { + typedef typename XprType::Scalar type; +}; +template <> +struct XprScalar { + typedef void type; +}; + +// -------------------------------------------------------------------------- // +// TensorMaterializedBlock is a fully evaluated block of the original tensor, +// and XprType is just a TensorMap over the data. This block type is typically +// used to materialize blocks of tensor expressions, that can't be efficiently +// represented as lazy Tensor expressions with fast coeff/packet operations, +// e.g. we materialize all broadcasts into evaluated blocks. +// +// TensorMaterializedBlock does not own its memory buffer, it's either a memory +// buffer that backs the original expression (e.g. block is just a view into a +// Tensor), or a memory buffer allocated with scratch allocator, and in this +// case the scratch allocator will deallocate it at the end of block based +// expression execution. +// +// If the block was evaluated directly into the output buffer, and strides in +// the output buffer do not match block strides, the TensorMap expression will +// be invalid, and should never be used in block assignment or any other tensor +// expression. + +template +class TensorMaterializedBlock { + public: + typedef DSizes Dimensions; + typedef TensorMap > XprType; + + TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, const Dimensions& dimensions, + bool valid_expr = true) + : m_kind(kind), m_data(data), m_dimensions(dimensions), m_expr(m_data, m_dimensions), m_valid_expr(valid_expr) { + eigen_assert(m_kind == internal::TensorBlockKind::kView || + m_kind == internal::TensorBlockKind::kMaterializedInScratch || + m_kind == internal::TensorBlockKind::kMaterializedInOutput); + } + + TensorBlockKind kind() const { return m_kind; } + // NOTE(ezhulenev): Returning XprType by value like in other block types + // causes asan failures. The theory is that XprType::Nested doesn't work + // properly for TensorMap. + const XprType& expr() const { + eigen_assert(m_valid_expr); + return m_expr; + } + const Scalar* data() const { return m_data; } + void cleanup() {} + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + + // TensorMaterializedBlock can be backed by different types of storage: + // + // (1) Contiguous block of memory allocated with scratch allocator. + // (2) Contiguous block of memory reused from tensor block descriptor + // destination buffer. + // (3) Strided block of memory reused from tensor block descriptor + // destination buffer. + // + class Storage { + public: + Scalar* data() const { return m_data; } + const Dimensions& dimensions() const { return m_dimensions; } + const Dimensions& strides() const { return m_strides; } + + TensorMaterializedBlock AsTensorMaterializedBlock() const { + return TensorMaterializedBlock(m_materialized_in_output ? 
internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + m_data, m_dimensions, !m_strided_storage); + } + + private: + friend class TensorMaterializedBlock; + + Storage(Scalar* data, const Dimensions& dimensions, const Dimensions& strides, bool materialized_in_output, + bool strided_storage) + : m_data(data), + m_dimensions(dimensions), + m_strides(strides), + m_materialized_in_output(materialized_in_output), + m_strided_storage(strided_storage) {} + + Scalar* m_data; + Dimensions m_dimensions; + Dimensions m_strides; + bool m_materialized_in_output; + bool m_strided_storage; + }; + + // Creates a storage for materialized block either from the block descriptor + // destination buffer, or allocates a new buffer with scratch allocator. + template + EIGEN_STRONG_INLINE static Storage prepareStorage(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool allow_strided_storage = false) { + // Try to reuse destination as an output block buffer. + typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer; + + if (desc.destination().kind() == DestinationBuffer::kContiguous) { + Scalar* buffer = desc.destination().template data(); + desc.DropDestinationBuffer(); + return Storage(buffer, desc.dimensions(), internal::strides(desc.dimensions()), + /*materialized_in_output=*/true, + /*strided_storage=*/false); + + } else if (desc.destination().kind() == DestinationBuffer::kStrided && allow_strided_storage) { + Scalar* buffer = desc.destination().template data(); + desc.DropDestinationBuffer(); + return Storage(buffer, desc.dimensions(), desc.destination().strides(), + /*materialized_in_output=*/true, /*strided_storage=*/true); + + } else { + void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); + return Storage(static_cast(mem), desc.dimensions(), internal::strides(desc.dimensions()), + /*materialized_in_output=*/false, + /*strided_storage=*/false); + } + } + + // Creates a materialized block for the given descriptor from a memory buffer. + template + EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(const Scalar* data, const DataDimensions& data_dims, + TensorBlockDesc& desc, TensorBlockScratch& scratch) { + eigen_assert(array_size::value == desc.dimensions().size()); + + // If a tensor block dimensions covers a contiguous block of the underlying + // memory, we can skip block buffer memory allocation, and construct a block + // from existing `data` memory buffer. + // + // Example: (RowMajor layout) + // data_dims: [11, 12, 13, 14] + // desc.dimensions(): [1, 1, 3, 14] + // + // In this case we can construct a TensorBlock starting at + // `data + desc.offset()`, with a `desc.dimensions()` block sizes. + static const bool is_col_major = Layout == ColMajor; + + // Find out how many inner dimensions have a matching size. + int num_matching_inner_dims = 0; + for (int i = 0; i < NumDims; ++i) { + int dim = is_col_major ? i : NumDims - i - 1; + if (data_dims[dim] != desc.dimensions()[dim]) break; + ++num_matching_inner_dims; + } + + // All the outer dimensions must be of size `1`, except a single dimension + // before the matching inner dimension (`3` in the example above). + bool can_use_direct_access = true; + for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) { + int dim = is_col_major ? 
i : NumDims - i - 1;
+      if (desc.dimension(dim) != 1) {
+        can_use_direct_access = false;
+        break;
+      }
+    }
+
+    if (can_use_direct_access) {
+      const Scalar* block_start = data + desc.offset();
+      return TensorMaterializedBlock(internal::TensorBlockKind::kView, block_start, desc.dimensions());
+
+    } else {
+      // Reuse destination buffer or allocate new buffer with scratch allocator.
+      const Storage storage = prepareStorage(desc, scratch);
+
+      typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout> TensorBlockIO;
+      typedef typename TensorBlockIO::Dst TensorBlockIODst;
+      typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)), data, desc.offset());
+      TensorBlockIODst dst(storage.dimensions(), storage.strides(), storage.data());
+
+      TensorBlockIO::Copy(dst, src);
+      return storage.AsTensorMaterializedBlock();
+    }
+  }
+
+ private:
+  TensorBlockKind m_kind;
+  const Scalar* m_data;
+  Dimensions m_dimensions;
+  XprType m_expr;
+  bool m_valid_expr;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
+// functor to the blocks produced by the underlying Tensor expression.
+
+template <typename UnaryOp, typename ArgTensorBlock>
+class TensorCwiseUnaryBlock {
+  static constexpr bool NoArgBlockAccess = internal::is_void<typename ArgTensorBlock::XprType>::value;
+
+ public:
+  typedef std::conditional_t<NoArgBlockAccess, void,
+                             TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >
+      XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
+      : m_arg_block(arg_block), m_functor(functor) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
+  const Scalar* data() const { return NULL; }
+  void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+  ArgTensorBlock m_arg_block;
+  UnaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseBinaryBlock is a lazy tensor expression block that applies BinaryOp
+// functor to the blocks produced by the two underlying Tensor expressions.
+
+template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
+class TensorCwiseBinaryBlock {
+  static constexpr bool NoArgBlockAccess = internal::is_void<typename LhsTensorBlock::XprType>::value ||
+                                           internal::is_void<typename RhsTensorBlock::XprType>::value;
+
+ public:
+  typedef std::conditional_t<
+      NoArgBlockAccess, void,
+      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType, const typename RhsTensorBlock::XprType> >
+      XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, const RhsTensorBlock& right_block, const BinaryOp& functor)
+      : m_left_block(left_block), m_right_block(right_block), m_functor(functor) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+  XprType expr() const { return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); }
+
+  const Scalar* data() const { return NULL; }
+
+  void cleanup() {
+    m_left_block.cleanup();
+    m_right_block.cleanup();
+  }
+
+ private:
+  LhsTensorBlock m_left_block;
+  RhsTensorBlock m_right_block;
+  BinaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorUnaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from a block of the underlying type (this is a
+// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
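+//
+// A BlockFactory is expected to provide an `expr()` method that wraps the
+// argument block expression (see `TensorUnaryExprBlock::expr()` below). A
+// hypothetical sketch, where `NewDimensions` is an assumed dimensions type:
+//
+//   struct ReshapeBlockFactory {
+//     template <typename ArgXprType>
+//     TensorReshapingOp<const NewDimensions, const ArgXprType> expr(const ArgXprType& arg) const {
+//       return TensorReshapingOp<const NewDimensions, const ArgXprType>(arg, m_dims);
+//     }
+//     NewDimensions m_dims;
+//   };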
+ +template +class TensorUnaryExprBlock { + typedef typename ArgTensorBlock::XprType ArgXprType; + static constexpr bool NoArgBlockAccess = internal::is_void::value; + + public: + typedef std::conditional_t::type> XprType; + + typedef typename XprScalar::type Scalar; + + TensorUnaryExprBlock(const ArgTensorBlock& arg_block, const BlockFactory& factory) + : m_arg_block(arg_block), m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { return m_factory.expr(m_arg_block.expr()); } + const Scalar* data() const { return NULL; } + void cleanup() { m_arg_block.cleanup(); } + + private: + ArgTensorBlock m_arg_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// TensorTernaryExprBlock is a lazy tensor expression block that can construct +// an arbitrary tensor expression from three blocks of the underlying type. + +template +class TensorTernaryExprBlock { + typedef typename Arg1TensorBlock::XprType Arg1XprType; + typedef typename Arg2TensorBlock::XprType Arg2XprType; + typedef typename Arg3TensorBlock::XprType Arg3XprType; + + static constexpr bool NoArgBlockAccess = internal::is_void::value || + internal::is_void::value || + internal::is_void::value; + + public: + typedef std::conditional_t::type> + XprType; + + typedef typename XprScalar::type Scalar; + + TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, const Arg2TensorBlock& arg2_block, + const Arg3TensorBlock& arg3_block, const BlockFactory& factory) + : m_arg1_block(arg1_block), m_arg2_block(arg2_block), m_arg3_block(arg3_block), m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), m_arg3_block.expr()); } + const Scalar* data() const { return NULL; } + void cleanup() { + m_arg1_block.cleanup(); + m_arg2_block.cleanup(); + m_arg3_block.cleanup(); + } + + private: + Arg1TensorBlock m_arg1_block; + Arg2TensorBlock m_arg2_block; + Arg3TensorBlock m_arg3_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// StridedLinearBufferCopy provides a method to copy data between two linear +// buffers with different strides, with optimized paths for scatter/gather. + +template +class StridedLinearBufferCopy { + typedef typename packet_traits::type Packet; + typedef typename unpacket_traits::half HalfPacket; + enum { + Vectorizable = packet_traits::Vectorizable, + PacketSize = packet_traits::size, + HalfPacketSize = unpacket_traits::size, + HasHalfPacket = static_cast(HalfPacketSize) < static_cast(PacketSize) + }; + + public: + // Specifying linear copy kind statically gives ~30% speedup for small sizes. 
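+  // The kind is passed as a template argument of `Run` at the call site, e.g.
+  // (sketch, with `LinCopy` an alias for this class as used by TensorBlockIO):
+  //
+  //   LinCopy::template Run<LinCopy::Kind::Linear>(dst, src, count);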
+ enum class Kind { + Linear = 0, // src_stride == 1 && dst_stride == 1 + Scatter = 1, // src_stride == 1 && dst_stride != 1 + FillLinear = 2, // src_stride == 0 && dst_stride == 1 + FillScatter = 3, // src_stride == 0 && dst_stride != 1 + Gather = 4, // dst_stride == 1 + Random = 5 // everything else + }; + + struct Dst { + Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + Scalar* data; + }; + + struct Src { + Src(IndexType o, IndexType s, const Scalar* d) : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + const Scalar* data; + }; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, const Src& src, const size_t count) { + Run(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, src.data); + } + + private: + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const IndexType count, const IndexType dst_offset, + const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data, + const IndexType src_offset, const IndexType src_stride, + const Scalar* EIGEN_RESTRICT src_data) { + const Scalar* src = &src_data[src_offset]; + Scalar* dst = &dst_data[dst_offset]; + + if (!Vectorizable) { + for (Index i = 0; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + return; + } + + const IndexType vectorized_size = PacketSize * (count / PacketSize); + IndexType i = 0; + + if (kind == StridedLinearBufferCopy::Kind::Linear) { + // ******************************************************************** // + // Linear copy from `src` to `dst`. + const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize)); + eigen_assert(src_stride == 1 && dst_stride == 1); + for (; i < unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + Packet p = ploadu(src + i + j * PacketSize); + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i < vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pstoreu(dst + i, p); + } + if (HasHalfPacket) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { + HalfPacket p = ploadu(src + i); + pstoreu(dst + i, p); + i += HalfPacketSize; + } + } + for (; i < count; ++i) { + dst[i] = src[i]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Scatter) { + // Scatter from `src` to `dst`. + eigen_assert(src_stride == 1 && dst_stride != 1); + for (; i < vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pscatter(dst + i * dst_stride, p, dst_stride); + } + if (HasHalfPacket) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { + HalfPacket p = ploadu(src + i); + pscatter(dst + i * dst_stride, p, dst_stride); + i += HalfPacketSize; + } + } + for (; i < count; ++i) { + dst[i * dst_stride] = src[i]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) { + // Fill `dst` with value at `*src`. 
+ eigen_assert(src_stride == 0 && dst_stride == 1); + + const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize)); + Scalar s = *src; + Packet p = pset1(s); + for (; i < unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i < vectorized_size; i += PacketSize) { + pstoreu(dst + i, p); + } + if (HasHalfPacket) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { + HalfPacket hp = pset1(s); + pstoreu(dst + i, hp); + i += HalfPacketSize; + } + } + for (; i < count; ++i) { + dst[i] = s; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) { + // Scatter `*src` into `dst`. + eigen_assert(src_stride == 0 && dst_stride != 1); + Scalar s = *src; + Packet p = pset1(s); + for (; i < vectorized_size; i += PacketSize) { + pscatter(dst + i * dst_stride, p, dst_stride); + } + if (HasHalfPacket) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { + HalfPacket hp = pset1(s); + pscatter(dst + i * dst_stride, hp, dst_stride); + i += HalfPacketSize; + } + } + for (; i < count; ++i) { + dst[i * dst_stride] = s; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Gather) { + // Gather from `src` into `dst`. + eigen_assert(dst_stride == 1); + for (; i < vectorized_size; i += PacketSize) { + Packet p = pgather(src + i * src_stride, src_stride); + pstoreu(dst + i, p); + } + if (HasHalfPacket) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { + HalfPacket p = pgather(src + i * src_stride, src_stride); + pstoreu(dst + i, p); + i += HalfPacketSize; + } + } + for (; i < count; ++i) { + dst[i] = src[i * src_stride]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Random) { + // Random. + for (; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + } else { + eigen_assert(false); + } + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block. +// It's possible to specify src->dst dimension mapping for the copy operation. +// Dimensions of `dst` specify how many elements have to be copied, for the +// `src` we need to know only stride to navigate through source memory buffer. 
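+//
+// For example (sketch): for a rank-2 copy whose dimensions are swapped between
+// `src` and `dst`, the map is {1, 0}:
+//
+//   typename TensorBlockIO<Scalar, IndexType, 2, Layout>::DimensionsMap map;
+//   map[0] = 1;  map[1] = 0;  // src_dim = map[dst_dim]
+//   TensorBlockIO<Scalar, IndexType, 2, Layout>::Copy(dst, src, map);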
+ +template +class TensorBlockIO { + static constexpr bool IsColMajor = (Layout == ColMajor); + + typedef StridedLinearBufferCopy LinCopy; + + public: + typedef DSizes Dimensions; + typedef DSizes DimensionsMap; + + struct Dst { + Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, IndexType dst_offset = 0) + : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} + + Dimensions dims; + Dimensions strides; + Scalar* data; + IndexType offset; + }; + + struct Src { + Src(const Dimensions& src_strides, const Scalar* src, IndexType src_offset = 0) + : strides(src_strides), data(src), offset(src_offset) {} + + Dimensions strides; + const Scalar* data; + IndexType offset; + }; + + // Copies data to `dst` from `src`, using provided dimensions mapping: + // + // src_dimension_index = dst_to_src_dim_map[dst_dimension_index] + // + // Returns the number of copied elements. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(const Dst& dst, const Src& src, + const DimensionsMap& dst_to_src_dim_map) { + // Copy single scalar value from `src` to `dst`. + if (NumDims == 0) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return 1; + } + + // Both `dst` and `src` must have contiguous innermost dimension. We also + // accept the special case with stride '0', because it's used as a trick to + // implement broadcasting. + { + int inner_dim = IsColMajor ? 0 : NumDims - 1; + EIGEN_UNUSED_VARIABLE(inner_dim); + eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0); + eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0); + } + + // Give a shorter name to `dst_to_src_dim_map`. + const DimensionsMap& dim_map = dst_to_src_dim_map; + + // Do not squeeze reordered inner dimensions. + int num_squeezable_dims = NumSqueezableInnerDims(dim_map); + + // NOTE: We find the innermost dimension (contiguous in memory) in the dst + // block, and we write data linearly into that dimension, reading it from + // the src. If dimensions are reordered, we might end up reading data from + // the src with `stride != 1`. + // + // NOTE: Random-Read/Linear-Write can be up to ~2X faster than + // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680 + + // Find the innermost dimension in the dst whose size is not 1. This is the + // effective inner dim. + int num_size_one_inner_dims = 0; + for (int i = 0; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? i : NumDims - i - 1; + if (dst.dims[dst_dim] != 1) break; + num_size_one_inner_dims++; + } + + // If all dimensions are of size 1, just copy a scalar from `src` to `dst`. + if (num_size_one_inner_dims == NumDims) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return 1; + } + + // Outermost dimension in the dst with `stride == 1` (contiguous in memory). + const int dst_stride1_dim = IsColMajor ? num_size_one_inner_dims : NumDims - num_size_one_inner_dims - 1; + + // Dimension in the src that corresponds to the dst innermost dimension. + const int src_dim_for_dst_stride1_dim = NumDims == 0 ? 1 : dim_map[dst_stride1_dim]; + + // Size of the innermost dimension (length of contiguous blocks of memory). + IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim]; + + // Squeeze multiple inner dims into one if they are contiguous in `dst` and + // `src` memory, so we can do less linear copy calls. + for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? 
i : NumDims - i - 1; + const IndexType dst_stride = dst.strides[dst_dim]; + const IndexType src_stride = src.strides[dim_map[dst_dim]]; + if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) { + dst_inner_dim_size *= dst.dims[dst_dim]; + ++num_size_one_inner_dims; + } else { + break; + } + } + + // Setup strides to read data from `src` and write to `dst`. + IndexType input_offset = src.offset; + IndexType output_offset = dst.offset; + IndexType input_stride = NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim]; + IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim]; + + const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array it; + + // Initialize block iterator state. Squeeze away any dimension of size 1. + int idx = 0; // currently initialized iterator state index + for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { + const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2; + if (dst.dims[dst_dim] == 1) continue; + + it[idx].size = dst.dims[dst_dim]; + it[idx].input_stride = src.strides[dim_map[dst_dim]]; + it[idx].output_stride = dst.strides[dst_dim]; + + it[idx].input_span = it[idx].input_stride * (it[idx].size - 1); + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + + idx++; + } + + // Iterate copying data from src to dst. + const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize(); + +#define COPY_INNER_DIM(KIND) \ + IndexType num_copied = 0; \ + for (num_copied = 0; num_copied < block_total_size; num_copied += dst_inner_dim_size) { \ + LinCopy::template Run(typename LinCopy::Dst(output_offset, output_stride, dst.data), \ + typename LinCopy::Src(input_offset, input_stride, src.data), dst_inner_dim_size); \ + \ + for (int j = 0; j < idx; ++j) { \ + if (++it[j].count < it[j].size) { \ + input_offset += it[j].input_stride; \ + output_offset += it[j].output_stride; \ + break; \ + } \ + it[j].count = 0; \ + input_offset -= it[j].input_span; \ + output_offset -= it[j].output_span; \ + } \ + } \ + return num_copied; + + if (input_stride == 1 && output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::Linear); + } else if (input_stride == 1 && output_stride != 1) { + COPY_INNER_DIM(LinCopy::Kind::Scatter); + } else if (input_stride == 0 && output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::FillLinear); + } else if (input_stride == 0 && output_stride != 1) { + COPY_INNER_DIM(LinCopy::Kind::FillScatter); + } else if (output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::Gather); + } else { + COPY_INNER_DIM(LinCopy::Kind::Random); + } + +#undef COPY_INNER_DIM + } + + // Copy from `src` to `dst` with an identity src->dst dimension map. Returns + // the number of copied elements. + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, const Src& src) { + DimensionsMap dst_to_src_map; + for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i; + return Copy(dst, src, dst_to_src_map); + } + + private: + struct BlockIteratorState { + BlockIteratorState() : size(0), count(0), input_stride(0), output_stride(0), input_span(0), output_span(0) {} + + IndexType size; + IndexType count; + IndexType input_stride; + IndexType output_stride; + IndexType input_span; + IndexType output_span; + }; + + // Compute how many inner dimensions it's allowed to squeeze when doing IO + // between two tensor blocks. It's safe to squeeze inner dimensions, only + // if they are not reordered. 
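+  //
+  // E.g. (ColMajor) for dim_map == {0, 1, 3, 2} the two innermost dimensions
+  // map to themselves, so two dimensions are squeezable; dimension 2 is
+  // reordered and stops the scan.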
+ static int NumSqueezableInnerDims(const DimensionsMap& dim_map) { + int num_squeezable_dims = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + if (dim_map[dim] != dim) break; + num_squeezable_dims++; + } + return num_squeezable_dims; + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to +// a Tensor block defined by `desc`, backed by a memory buffer at `target`. +// +// Currently there is no way to write from a Tensor expression to a block of +// memory, if dimensions are reordered. If you need to do that, you should +// materialize a Tensor block expression into a memory buffer, and then use +// TensorBlockIO to copy data between two memory buffers with a custom +// `target->src` dimension map (see definition above). +// +// Also currently the innermost dimension of `target` must have a stride '1' +// (contiguous in memory). This restriction could be lifted with a `pscatter`, +// but in practice it's never needed, and there is a similar TensorBlockIO +// workaround for that. +// +// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO +// where `src` is a tensor expression. Explore if it is possible to rewrite IO +// to use expressions instead of pointers, and after that TensorBlockAssignment +// will become an alias to IO. +template +class TensorBlockAssignment { + // We will use coeff/packet path to evaluate block expressions. + typedef TensorEvaluator TensorBlockEvaluator; + + typedef DSizes Dimensions; + + enum { Vectorizable = packet_traits::Vectorizable, PacketSize = packet_traits::size }; + + template + struct InnerDimAssign { + EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) { + for (IndexType i = 0; i < count; ++i) { + target[i] = eval.coeff(eval_offset + i); + } + } + }; + + template + struct InnerDimAssign { + EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) { + typedef typename packet_traits::type Packet; + + const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize)); + const IndexType vectorized_size = PacketSize * (count / PacketSize); + IndexType i = 0; + + for (; i < unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + const IndexType idx = eval_offset + i + j * PacketSize; + Packet p = eval.template packet(idx); + pstoreu(target + i + j * PacketSize, p); + } + } + + for (; i < vectorized_size; i += PacketSize) { + Packet p = eval.template packet(eval_offset + i); + pstoreu(target + i, p); + } + + for (; i < count; ++i) { + target[i] = eval.coeff(eval_offset + i); + } + } + }; + + public: + struct Target { + Target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data, + IndexType target_offset = 0) + : dims(target_dims), strides(target_strides), data(target_data), offset(target_offset) {} + + Dimensions dims; + Dimensions strides; + Scalar* data; + IndexType offset; + }; + + static Target target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data, + IndexType target_offset = 0) { + return Target(target_dims, target_strides, target_data, target_offset); + } + + template + static Target target(const DSizes& target_dims, + const DSizes& target_strides, Scalar* target_data, + IndexType target_offset = 0) { + // DSizes constructor will do index type promotion 
if it's safe. + return Target(Dimensions(target_dims), Dimensions(target_strides), target_data, target_offset); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Target& target, const TensorBlockExpr& expr) { + // Prepare evaluator for block expression. + DefaultDevice default_device; + TensorBlockEvaluator eval(expr, default_device); + + // Tensor block expression dimension should match destination dimensions. + eigen_assert(dimensions_match(target.dims, eval.dimensions())); + + static const int Layout = TensorBlockEvaluator::Layout; + static const bool is_col_major = Layout == ColMajor; + + // Initialize output inner dimension size based on a layout. + const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize(); + const int inner_dim_idx = is_col_major ? 0 : NumDims - 1; + IndexType output_inner_dim_size = target.dims[inner_dim_idx]; + + // Target inner dimension stride must be '1'. + eigen_assert(target.strides[inner_dim_idx] == 1); + + // Squeeze multiple inner dims into one if they are contiguous in `target`. + IndexType num_squeezed_dims = 0; + for (Index i = 1; i < NumDims; ++i) { + const Index dim = is_col_major ? i : NumDims - i - 1; + const IndexType target_stride = target.strides[dim]; + + if (output_inner_dim_size == target_stride) { + output_inner_dim_size *= target.dims[dim]; + num_squeezed_dims++; + } else { + break; + } + } + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + + int idx = 0; // currently initialized iterator state index + for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) { + const Index dim = is_col_major ? i + 1 : NumDims - i - 2; + + it[idx].count = 0; + it[idx].size = target.dims[dim]; + it[idx].output_stride = target.strides[dim]; + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + idx++; + } + + // We read block expression from the beginning, and start writing data to + // `target` at given offset. + IndexType input_offset = 0; + IndexType output_offset = target.offset; + + // Iterate copying data from `eval` to `target`. + for (IndexType i = 0; i < output_size; i += output_inner_dim_size) { + // Assign to `target` at current offset. + InnerDimAssign::Run( + target.data + output_offset, output_inner_dim_size, eval, input_offset); + + // Move input offset forward by the number of assigned coefficients. + input_offset += output_inner_dim_size; + + // Update index. 
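+      // (Odometer-style carry: advance the innermost dimension's counter;
+      // when it wraps, rewind its output span and carry into the next-outer
+      // dimension.)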
+ for (int j = 0; j < idx; ++j) { + if (++it[j].count < it[j].size) { + output_offset += it[j].output_stride; + break; + } + it[j].count = 0; + output_offset -= it[j].output_span; + } + } + } + + private: + struct BlockIteratorState { + BlockIteratorState() : count(0), size(0), output_stride(0), output_span(0) {} + + IndexType count; + IndexType size; + IndexType output_stride; + IndexType output_span; + }; +}; + +// -------------------------------------------------------------------------- // + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h new file mode 100644 index 00000000000..9ef4bbce092 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -0,0 +1,1001 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H +#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorBroadcasting + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor broadcasting class. + * + * + */ +namespace internal { +template +struct traits> : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorBroadcastingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, + typename eval>::type> { + typedef TensorBroadcastingOp type; +}; + +template +struct is_input_scalar { + static const bool value = false; +}; +template <> +struct is_input_scalar> { + static const bool value = true; +}; +template +struct is_input_scalar> { + static constexpr bool value = (Sizes::total_size == 1); +}; + +} // end namespace internal + +template +class TensorBroadcastingOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) + : m_xpr(expr), m_broadcast(broadcast) {} + + EIGEN_DEVICE_FUNC const Broadcast& broadcast() const { return m_broadcast; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Broadcast m_broadcast; +}; + +// Eval as rvalue +template +struct 
TensorEvaluator, Device> { + typedef TensorBroadcastingOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions InputDimensions; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator won't be + // standard layout; + bool isCopy, nByOne, oneByN; + + public: + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + RawAccess = false + }; + static constexpr int Layout = TensorEvaluator::Layout; + + typedef std::remove_const_t ScalarNoConst; + + // We do block based broadcasting using a trick with 2x tensor rank and 0 + // strides. See block method implementation for details. + typedef DSizes BroadcastDimensions; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : isCopy(false), + nByOne(false), + oneByN(false), + m_device(device), + m_broadcast(op.broadcast()), + m_impl(op.expression(), device) { + // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar + // and store the result in a scalar. Instead one should reshape the scalar into a N-D + // tensor with N >= 1 of 1 element first and then broadcast. 
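+    // For example (sketch, with `n` an assumed broadcast factor):
+    //   Eigen::Tensor<float, 0> s;
+    //   auto b = s.reshape(Eigen::array<Eigen::Index, 1>{{1}})
+    //             .broadcast(Eigen::array<Eigen::Index, 1>{{n}});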
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + const InputDimensions& input_dims = m_impl.dimensions(); + isCopy = true; + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i] * m_broadcast[i]; + if (m_broadcast[i] != 1) { + isCopy = false; + } + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_inputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + + if (input_dims[0] == 1) { + oneByN = true; + for (int i = 1; i < NumDims; ++i) { + if (m_broadcast[i] != 1) { + oneByN = false; + break; + } + } + } else if (input_dims[NumDims - 1] == 1) { + nByOne = true; + for (int i = 0; i < NumDims - 1; ++i) { + if (m_broadcast[i] != 1) { + nByOne = false; + break; + } + } + } + + // Handle special format like NCHW, its input shape is '[1, N..., 1]' and + // broadcast shape is '[N, 1..., N]' + if (!oneByN && !nByOne) { + if (input_dims[0] == 1 && input_dims[NumDims - 1] == 1 && NumDims > 2) { + nByOne = true; + oneByN = true; + for (int i = 1; i < NumDims - 1; ++i) { + if (m_broadcast[i] != 1) { + nByOne = false; + oneByN = false; + break; + } + } + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { + if (internal::is_input_scalar>::value) { + return m_impl.coeff(0); + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + if (isCopy) { + return m_impl.coeff(index); + } else { + return coeffColMajor(index); + } + } else { + if (isCopy) { + return m_impl.coeff(index); + } else { + return coeffRowMajor(index); + } + } + } + + // TODO: attempt to speed this up. 
The integer divisions and modulo are slow + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const { + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + inputIndex += index; + } else { + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[0]); + } + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const { + return m_impl.coeff(indexColMajor(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const { + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims - 1]); + inputIndex += index; + } else { + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[NumDims - 1]); + } + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const { + return m_impl.coeff(indexRowMajor(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + if (internal::is_input_scalar>::value) { + return internal::pset1(m_impl.coeff(0)); + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + if (isCopy) { +#ifdef EIGEN_GPU_COMPILE_PHASE + // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing + // unaligned loads here. The reason is unclear though. + return m_impl.template packet(index); +#else + return m_impl.template packet(index); +#endif + } else if (oneByN && !nByOne) { + return packetNByOne(index); + } else if (!oneByN && nByOne) { + return packetOneByN(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne(index); + } else { + return packetColMajor(index); + } + } else { + if (isCopy) { +#ifdef EIGEN_GPU_COMPILE_PHASE + // See above. 
+ return m_impl.template packet(index); +#else + return m_impl.template packet(index); +#endif + } else if (oneByN && !nByOne) { + return packetOneByN(index); + } else if (!oneByN && nByOne) { + return packetNByOne(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne(index); + } else { + return packetRowMajor(index); + } + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + Index startDim, endDim; + Index inputIndex, outputOffset, batchedIndex; + + if (static_cast(Layout) == static_cast(ColMajor)) { + startDim = NumDims - 1; + endDim = 1; + } else { + startDim = 0; + endDim = NumDims - 2; + } + + batchedIndex = index % m_outputStrides[startDim]; + inputIndex = batchedIndex / m_outputStrides[endDim]; + outputOffset = batchedIndex % m_outputStrides[endDim]; + + if (outputOffset + PacketSize <= m_outputStrides[endDim]) { + values[0] = m_impl.coeff(inputIndex); + return internal::pload1(values); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { + if (outputOffset + cur < m_outputStrides[endDim]) { + values[i] = m_impl.coeff(inputIndex); + } else { + ++inputIndex; + inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex); + values[i] = m_impl.coeff(inputIndex); + outputOffset = 0; + cur = 0; + } + } + return internal::pload(values); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const { + // Consider the flattened tensor [v0, ..., vN], + // Concatenates m_broadcast[dim] copies, + // [v0, ..., vN, v0, ..., vN, ... ] + // with dim == NumDims - 1 for col-major, dim == 0 for row-major. + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + // Size of flattened tensor. + const Index M = + (static_cast(Layout) == static_cast(ColMajor)) ? m_inputStrides[NumDims - 1] : m_inputStrides[0]; + Index inputIndex = index % M; + if (inputIndex + PacketSize <= M) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + if (inputIndex > M - 1) { + inputIndex = 0; + } + values[i] = m_impl.coeff(inputIndex++); + } + return internal::pload(values); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const { + // Consider the flattened tensor [v0, ..., vN], + // Interleaves m_broadcast[dim] copies, + // [v0, v0, ..., v1, v1, ..., vN, vN, ... ] + // with dim == 0 for col-major, dim == NumDims - 1 for row-major. + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + const Index M = + (static_cast(Layout) == static_cast(ColMajor)) ? m_broadcast[0] : m_broadcast[NumDims - 1]; + + Index inputIndex = index / M; + Index outputOffset = index % M; + if (outputOffset + PacketSize <= M) { + return internal::pset1(m_impl.coeff(inputIndex)); + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + if (outputOffset < M) { + values[i] = m_impl.coeff(inputIndex); + ++outputOffset; + } else { + values[i] = m_impl.coeff(++inputIndex); + outputOffset = 1; // Next offset. + } + } + return internal::pload(values); + } + } + + // Ignore the LoadMode and always use unaligned loads since we can't guarantee + // the alignment at compile time. 
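+  // (This is why packetColMajor/packetRowMajor below read the input with
+  // m_impl.template packet<Unaligned>(...) regardless of the caller's
+  // LoadMode.)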
+ template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + innermostLoc = index; + } else { + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[0]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. + if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + values[0] = m_impl.coeff(inputIndex); + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize; ++i) { + if (innermostLoc + i < m_impl.dimensions()[0]) { + values[i] = m_impl.coeff(inputIndex + i); + } else { + values[i] = coeffColMajor(originalIndex + i); + } + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims - 1]); + innermostLoc = index; + } else { + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[NumDims - 1]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. 
+ if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims - 1]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + values[0] = m_impl.coeff(inputIndex); + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize; ++i) { + if (innermostLoc + i < m_impl.dimensions()[NumDims - 1]) { + values[i] = m_impl.coeff(inputIndex + i); + } else { + values[i] = coeffRowMajor(originalIndex + i); + } + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = TensorOpCost::AddCost(); + if (!isCopy && NumDims > 0) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + compute_cost += TensorOpCost::DivCost(); + if (internal::index_statically_eq(i, 1)) { + compute_cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } else { + if (!internal::index_statically_eq(i, 1)) { + compute_cost += + TensorOpCost::MulCost() + TensorOpCost::ModCost() + TensorOpCost::AddCost(); + } + } + compute_cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } + } + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large + // tensors. But this might need further tuning. + const size_t target_size = m_device.firstLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + m_impl.getResourceRequirements(), internal::TensorBlockResourceRequirements::skewed(target_size)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + BlockBroadcastingParams params = blockBroadcastingParams(desc); + + if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) { + return emptyBlock(); + } + + // Prepare storage for the materialized broadcasting result. + const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + ScalarNoConst* materialized_output = block_storage.data(); + + // We potentially will need to materialize input blocks. + size_t materialized_input_size = 0; + ScalarNoConst* materialized_input = NULL; + + // Initialize block broadcating iterator state for outer dimensions (outer + // with regard to bcast dimension). Dimension in this array are always in + // inner_most -> outer_most order (col major layout). + array it; + int idx = 0; + + for (int i = params.inner_dim_count + 1; i < NumDims; ++i) { + const Index dim = IsColMajor ? i : NumDims - 1 - i; + it[idx].size = params.output_dims[dim]; + it[idx].count = 0; + it[idx].output_stride = m_outputStrides[dim]; + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + idx++; + } + + // Write output into the beginning of `materialized_output`. + Index output_offset = 0; + + // We will fill output block by broadcasting along the bcast dim, and + // iterating over outer dimension. + const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize(); + + for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) { + ScalarNoConst* bcast_output = materialized_output + num_output_coeffs; + Index bcast_offset = desc.offset() + output_offset; + + // Broadcast along the bcast dimension. 
+      num_output_coeffs += BroadcastBlockAlongBcastDim(params, bcast_offset, scratch, bcast_output,
+                                                       &materialized_input, &materialized_input_size);
+
+      // Switch to the next outer dimension.
+      for (int j = 0; j < idx; ++j) {
+        if (++it[j].count < it[j].size) {
+          output_offset += it[j].output_stride;
+          break;
+        }
+        it[j].count = 0;
+        output_offset -= it[j].output_span;
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+  Broadcast functor() const { return m_broadcast; }
+
+ private:
+  static constexpr bool IsColMajor = static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+  // We will build a general case block broadcasting on top of a broadcasting
+  // primitive that will do broadcasting only for the inner dimension(s) along
+  // the first dimension smaller than the input size (it's called `bcast_dim`).
+  //
+  // Example:
+  //   dim:          0  1  2   (ColMajor)
+  //   input size:  [9, 3, 6]
+  //   block size:  [9, 2, 6]
+  //
+  // We will compute the broadcasted block by iterating over the outer
+  // dimensions after `bcast_dim` (only dimension `2` in this example) and
+  // computing broadcasts along the `bcast_dim` (dimension `1` in this example).
+
+  // BlockBroadcastingParams holds precomputed parameters for broadcasting a
+  // single block along the broadcasting dimension. Sizes and strides along the
+  // `bcast_dim` might be invalid, they will be adjusted later in
+  // `BroadcastBlockAlongBcastDim`.
+  struct BlockBroadcastingParams {
+    Dimensions input_dims;      // input expression dimensions
+    Dimensions output_dims;     // output block sizes
+    Dimensions output_strides;  // output block strides
+
+    int inner_dim_count;   // count inner dimensions matching in size
+    int bcast_dim;         // broadcasting dimension index
+    Index bcast_dim_size;  // broadcasting dimension size
+    Index inner_dim_size;  // inner dimensions size
+
+    // Block sizes and strides for the input block where all dimensions before
+    // `bcast_dim` are equal to `1`.
+    Dimensions input_block_sizes;
+    Dimensions input_block_strides;
+
+    // Block sizes and strides for blocks with extra dimensions and strides `0`.
+    BroadcastDimensions bcast_block_sizes;
+    BroadcastDimensions bcast_block_strides;
+    BroadcastDimensions bcast_input_strides;
+  };
+
+  struct BlockBroadcastingIteratorState {
+    Index size;
+    Index count;
+    Index output_stride;
+    Index output_span;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams blockBroadcastingParams(TensorBlockDesc& desc) const {
+    BlockBroadcastingParams params;
+
+    params.input_dims = Dimensions(m_impl.dimensions());
+
+    // Output block sizes and strides.
+    params.output_dims = desc.dimensions();
+    params.output_strides = internal::strides<Layout>(params.output_dims);
+
+    // Find the broadcasting dimension (first dimension with output size
+    // smaller than the input size).
+    params.bcast_dim = 0;
+    params.bcast_dim_size = 1;
+    params.inner_dim_size = 1;
+
+    // Count the number of inner dimensions that have the same size in the
+    // block and in the broadcast expression.
+    params.inner_dim_count = 0;
+
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+
+      if (params.output_dims[dim] == m_dimensions[dim]) {
+        params.inner_dim_size *= params.output_dims[dim];
+        ++params.inner_dim_count;
+        continue;
+      }
+
+      // First non-matching dimension is the broadcasting dimension.
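+      // [Illustration, not part of the upstream file] For the example above
+      // (input [9, 3, 6], block [9, 2, 6], ColMajor): dim 0 matches (9 == 9),
+      // so inner_dim_count == 1 and inner_dim_size == 9; dim 1 is the first
+      // mismatch (2 < 3), making bcast_dim == 1 and bcast_dim_size == 2.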
+ eigen_assert(params.output_dims[dim] < m_dimensions[dim]); + params.bcast_dim = dim; + params.bcast_dim_size = params.output_dims[dim]; + break; + } + + // Calculate the input block size for looking into the input. + for (int i = 0; i < params.inner_dim_count; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + params.input_block_sizes[dim] = params.input_dims[dim]; + } + for (int i = params.inner_dim_count; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + params.input_block_sizes[dim] = 1; + } + params.input_block_strides = internal::strides(params.input_block_sizes); + + // Broadcast with the 0-stride trick: Create 1 extra dim for each + // broadcast, set the input stride to 0. + // + // When ColMajor: + // + // - bcast_block_sizes: + // [d_0, b_0, d_1, b_1, ...] + // + // - bcast_block_strides: + // [output_block_strides[0], output_block_strides[0] * d_0, + // output_block_strides[1], output_block_strides[1] * d_1, + // ...] + // + // - bcast_input_strides: + // [input_block_strides[0], 0, + // input_block_strides[1], 0, + // ...]. + // + for (int i = 0; i < params.inner_dim_count; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + + const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1; + const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1; + + params.bcast_block_sizes[copy_dim] = params.input_dims[dim]; + params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim]; + params.bcast_block_strides[copy_dim] = params.output_strides[dim]; + params.bcast_block_strides[broadcast_dim] = params.output_strides[dim] * params.input_dims[dim]; + params.bcast_input_strides[copy_dim] = params.input_block_strides[dim]; + params.bcast_input_strides[broadcast_dim] = 0; + } + + for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) { + const int dim = IsColMajor ? i : 2 * NumDims - i - 1; + params.bcast_block_sizes[dim] = 1; + params.bcast_block_strides[dim] = 0; + params.bcast_input_strides[dim] = 0; + } + + return params; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const { + DSizes dimensions; + for (int i = 0; i < NumDims; ++i) dimensions[i] = 0; + return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim( + BlockBroadcastingParams params, Index bcast_offset, TensorBlockScratch& scratch, + ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, size_t* materialized_input_size) const { + if (params.bcast_dim_size == 1) { + // We just need one block read using the ready-set values above. + return BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + + } else if (params.input_dims[params.bcast_dim] == 1) { + // Broadcast bcast dimension (< NumDims) by bcast_dim_size. + const int broadcast_bcast_dim = + IsColMajor ? 
2 * params.inner_dim_count + 1 : 2 * NumDims - 2 * params.inner_dim_count - 2; + + params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = params.output_strides[params.bcast_dim]; + + return BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + + } else { + // Keep track of the total number of the coefficients written to the + // output block. + Index num_output_coeffs = 0; + + // The general case. Let's denote the output block as + // + // x[..., a:a+bcast_dim_size, :, ..., :] + // + // where a:a+bcast_dim_size is a slice on the bcast_dim dimension + // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3 + // sub-blocks: + // + // (1) a:b, where b is the smallest multiple of + // input_dims[bcast_dim_start] in [a, a+bcast_dim_size]. + // + // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start] + // in [a, a+bcast_dim_size]. + // + // (3) c:a+bcast_dim_size . + // + // Or, when b and c do not exist, we just need to process the whole block + // together. + + // Find a. + const Index bcast_dim_left_index = bcast_offset / m_outputStrides[params.bcast_dim]; + + // Find b and c. + const Index input_bcast_dim_size = params.input_dims[params.bcast_dim]; + + // First multiple after a. This is b when <= bcast_dim_left_index + + // bcast_dim_size. + const Index first_multiple = + numext::div_ceil(bcast_dim_left_index, input_bcast_dim_size) * input_bcast_dim_size; + + if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) { + // b exists, so does c. Find it. + const Index last_multiple = + (bcast_dim_left_index + params.bcast_dim_size) / input_bcast_dim_size * input_bcast_dim_size; + const int copy_bcast_dim = + IsColMajor ? 2 * params.inner_dim_count : 2 * NumDims - 2 * params.inner_dim_count - 1; + const int broadcast_bcast_dim = + IsColMajor ? 
2 * params.inner_dim_count + 1 : 2 * NumDims - 2 * params.inner_dim_count - 2; + + if (first_multiple > bcast_dim_left_index) { + const Index head_size = first_multiple - bcast_dim_left_index; + params.input_block_sizes[params.bcast_dim] = head_size; + params.bcast_block_sizes[copy_bcast_dim] = head_size; + params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = 1; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim]; + + num_output_coeffs += + BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + } + if (first_multiple < last_multiple) { + params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size; + params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size; + params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = (last_multiple - first_multiple) / input_bcast_dim_size; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim]; + const Index offset = (first_multiple - bcast_dim_left_index) * m_outputStrides[params.bcast_dim]; + + num_output_coeffs += + BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, offset, scratch, + materialized_output, materialized_input, materialized_input_size); + } + if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) { + const Index tail_size = bcast_dim_left_index + params.bcast_dim_size - last_multiple; + params.input_block_sizes[params.bcast_dim] = tail_size; + params.bcast_block_sizes[copy_bcast_dim] = tail_size; + params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = 1; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim]; + const Index offset = (last_multiple - bcast_dim_left_index) * m_outputStrides[params.bcast_dim]; + + num_output_coeffs += + BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, offset, scratch, + materialized_output, materialized_input, materialized_input_size); + } + } else { + // b and c do not exist. + const int copy_bcast_dim = + IsColMajor ? 
2 * params.inner_dim_count : 2 * NumDims - 2 * params.inner_dim_count - 1; + params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size; + params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size; + params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim]; + + num_output_coeffs += + BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + } + + return num_output_coeffs; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock( + const Dimensions& input_block_sizes, const Dimensions& input_block_strides, + const BroadcastDimensions& bcast_block_sizes, const BroadcastDimensions& bcast_block_strides, + const BroadcastDimensions& bcast_input_strides, Index bcast_offset, Index offset, TensorBlockScratch& scratch, + ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, size_t* materialized_input_size) const { + // ---------------------------------------------------------------------- // + // Tensor block descriptor for reading block from the input. + const Index input_offset = bcast_offset + offset; + TensorBlockDesc input_desc(IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset), + input_block_sizes); + + ArgTensorBlock input_block = m_impl.block(input_desc, scratch); + + // ---------------------------------------------------------------------- // + // Materialize input block into a temporary memory buffer only if it's not + // already available in the arg block. + const ScalarNoConst* input_buffer = NULL; + + if (input_block.data() != NULL) { + // Input block already has raw data, there is no need to materialize it. + input_buffer = input_block.data(); + + } else { + // Otherwise we have to do block assignment into a temporary buffer. + + // Maybe reuse previously allocated buffer, or allocate a new one with a + // scratch allocator. + const size_t input_total_size = input_block_sizes.TotalSize(); + if (*materialized_input == NULL || *materialized_input_size < input_total_size) { + *materialized_input_size = input_total_size; + void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar)); + *materialized_input = static_cast(mem); + } + + typedef internal::TensorBlockAssignment + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target(input_block_sizes, input_block_strides, *materialized_input), + input_block.expr()); + + input_buffer = *materialized_input; + } + + // ---------------------------------------------------------------------- // + // Copy data from materialized input block to the materialized output, using + // given broadcast strides (strides with zeroes). 
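+    // [Illustration, not part of the upstream file] The zero strides in
+    // `bcast_input_strides` make the copy below re-read the same input span
+    // for every broadcast repetition. The same idea with plain loops, for
+    // dims [d, b] and input strides {1, 0}:
+    //
+    //   for (Index j = 0; j < b; ++j)    // broadcast dim, input stride 0
+    //     for (Index i = 0; i < d; ++i)  // copy dim, input stride 1
+    //       dst[j * d + i] = src[i * 1 + j * 0];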
+ typedef internal::TensorBlockIO TensorBlockIO; + + typename TensorBlockIO::Src src(bcast_input_strides, input_buffer); + typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides, materialized_output + offset); + + return TensorBlockIO::Copy(dst, src); + } + + protected: + const Device EIGEN_DEVICE_REF m_device; + const std::remove_reference_t m_broadcast; + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h new file mode 100644 index 00000000000..5db563d601b --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -0,0 +1,474 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorKChippingReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor. + * + * + */ + +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions - 1; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorChippingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorChippingOp type; +}; + +template +struct DimensionId { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { + EIGEN_UNUSED_VARIABLE(dim); + eigen_assert(dim == DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { return DimId; } +}; +template <> +struct DimensionId { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) { eigen_assert(dim >= 0); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { return actual_dim; } + + private: + const DenseIndex actual_dim; +}; + +} // end namespace internal + +template +class TensorChippingOp : public TensorBase > { + public: + typedef TensorBase > Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) + : m_xpr(expr), m_offset(offset), 
m_dim(dim) { + eigen_assert(dim < XprType::NumDimensions && dim >= 0 && "Chip_Dim_out_of_range"); + } + + EIGEN_DEVICE_FUNC const Index offset() const { return m_offset; } + EIGEN_DEVICE_FUNC const Index dim() const { return m_dim.actualDim(); } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorChippingOp) + + protected: + typename XprType::Nested m_xpr; + const Index m_offset; + const internal::DimensionId m_dim; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorChippingOp XprType; + static constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + static constexpr int NumDims = NumInputDims - 1; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + static constexpr int Layout = TensorEvaluator::Layout; + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets. + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + // Chipping of outer-most dimension is a trivial operation, because we can + // read and write directly from the underlying tensor using single offset. + IsOuterChipping = (Layout == ColMajor && DimId == NumInputDims - 1) || (Layout == RowMajor && DimId == 0), + // Chipping inner-most dimension. + IsInnerChipping = (Layout == ColMajor && DimId == 0) || (Layout == RowMajor && DimId == NumInputDims - 1), + // Prefer block access if the underlying expression prefers it, otherwise + // only if chipping is not trivial. 
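+    // [Illustration, not part of the upstream file] Assuming a ColMajor tensor
+    // `t` of size [4, 5, 6]:
+    //
+    //   t.chip(2, 2);  // outer chip: 20 contiguous coefficients at offset 40
+    //   t.chip(1, 0);  // inner chip: 30 coefficients spaced 4 elements apart
+    //
+    // Only the outer chip can be read through a single base offset, which is
+    // why trivial (outer) chipping gains nothing from block access.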
+ PreferBlockAccess = TensorEvaluator::PreferBlockAccess || !IsOuterChipping, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef internal::TensorBlockDescriptor ArgTensorBlockDesc; + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { + EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(NumInputDims > m_dim.actualDim()); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + eigen_assert(op.offset() < input_dims[m_dim.actualDim()]); + + int j = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (i != m_dim.actualDim()) { + m_dimensions[j] = input_dims[i]; + ++j; + } + } + + m_stride = 1; + m_inputStride = 1; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < m_dim.actualDim(); ++i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } else { + for (int i = NumInputDims - 1; i > m_dim.actualDim(); --i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } + m_inputStride *= input_dims[m_dim.actualDim()]; + m_inputOffset = m_stride * op.offset(); + + // Check if chipping is effectively inner or outer: products of dimensions + // before or after the chipped dimension is `1`. + Index after_chipped_dim_product = 1; + for (int i = static_cast(m_dim.actualDim()) + 1; i < NumInputDims; ++i) { + after_chipped_dim_product *= input_dims[i]; + } + + Index before_chipped_dim_product = 1; + for (int i = 0; i < m_dim.actualDim(); ++i) { + before_chipped_dim_product *= input_dims[i]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_isEffectivelyInnerChipping = before_chipped_dim_product == 1; + m_isEffectivelyOuterChipping = after_chipped_dim_product == 1; + } else { + m_isEffectivelyInnerChipping = after_chipped_dim_product == 1; + m_isEffectivelyOuterChipping = before_chipped_dim_product == 1; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType /*data*/, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_impl.coeff(srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + if (isInnerChipping()) { + // m_stride is equal to 1, so let's avoid the integer division. 
+ eigen_assert(m_stride == 1); + Index inputIndex = index * m_inputStride + m_inputOffset; + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = m_impl.coeff(inputIndex); + inputIndex += m_inputStride; + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } else if (isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer division. + eigen_assert(m_stride > index); + return m_impl.template packet(index + m_inputOffset); + } else { + const Index idx = index / m_stride; + const Index rem = index - idx * m_stride; + if (rem + PacketSize <= m_stride) { + Index inputIndex = idx * m_inputStride + m_inputOffset + rem; + return m_impl.template packet(inputIndex); + } else { + // Cross the stride boundary. Fallback to slow path. + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index); + ++index; + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double cost = 0; + if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims - 1)) { + cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + cost += TensorOpCost::AddCost(); + } else { + cost += 3 * TensorOpCost::MulCost() + TensorOpCost::DivCost() + 3 * TensorOpCost::AddCost(); + } + + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + internal::TensorBlockResourceRequirements::skewed(target_size), m_impl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool root_of_expr_ast = false) const { + const Index chip_dim = m_dim.actualDim(); + + DSizes input_block_dims; + for (int i = 0; i < NumInputDims; ++i) { + input_block_dims[i] = i < chip_dim ? desc.dimension(i) : i > chip_dim ? desc.dimension(i - 1) : 1; + } + + ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims); + + // Try to reuse destination buffer for materializing argument block. + if (desc.HasDestinationBuffer()) { + DSizes arg_destination_strides; + for (int i = 0; i < NumInputDims; ++i) { + arg_destination_strides[i] = i < chip_dim ? desc.destination().strides()[i] + : i > chip_dim ? desc.destination().strides()[i - 1] + : 0; // for dimensions of size `1` stride should never be used. + } + + arg_desc.template AddDestinationBuffer(desc.destination().template data(), + arg_destination_strides); + } + + ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); + + if (arg_block.data() != NULL) { + // Forward argument block buffer if possible. + return TensorBlock(arg_block.kind(), arg_block.data(), desc.dimensions()); + + } else { + // Assign argument block expression to a buffer. 
+ + // Prepare storage for the materialized chipping result. + const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + + typedef internal::TensorBlockAssignment + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target(arg_desc.dimensions(), internal::strides(arg_desc.dimensions()), + block_storage.data()), + arg_block.expr()); + + return block_storage.AsTensorMaterializedBlock(); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); + if (isOuterChipping() && result) { + return result + m_inputOffset; + } else { + return NULL; + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex; + if (isInnerChipping()) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + inputIndex = index * m_inputStride + m_inputOffset; + } else if (isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer + // division. + eigen_assert(m_stride > index); + inputIndex = index + m_inputOffset; + } else { + const Index idx = index / m_stride; + inputIndex = idx * m_inputStride + m_inputOffset; + index -= idx * m_stride; + inputIndex += index; + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isInnerChipping() const { + return IsInnerChipping || m_isEffectivelyInnerChipping; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isOuterChipping() const { + return IsOuterChipping || m_isEffectivelyOuterChipping; + } + + Dimensions m_dimensions; + Index m_stride; + Index m_inputOffset; + Index m_inputStride; + TensorEvaluator m_impl; + const internal::DimensionId m_dim; + const Device EIGEN_DEVICE_REF m_device; + + // If product of all dimensions after or before the chipped dimension is `1`, + // it is effectively the same as chipping innermost or outermost dimension. + bool m_isEffectivelyInnerChipping; + bool m_isEffectivelyOuterChipping; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorChippingOp XprType; + static constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + static constexpr int NumDims = NumInputDims - 1; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + if (this->isInnerChipping()) { + // m_stride is equal to 1, so let's avoid the integer division. 
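+      // [Illustration, not part of the upstream file] A packet written through
+      // an inner chip is scattered with stride m_inputStride, e.g. assuming
+      //
+      //   Eigen::Tensor<float, 2> t(4, 5);
+      //   t.chip(0, 0).setZero();  // writes t(0, j) for j = 0..4, 4 apart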
+ eigen_assert(this->m_stride == 1); + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + internal::pstore(values, x); + Index inputIndex = index * this->m_inputStride + this->m_inputOffset; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->m_impl.coeffRef(inputIndex) = values[i]; + inputIndex += this->m_inputStride; + } + } else if (this->isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer division. + eigen_assert(this->m_stride > index); + this->m_impl.template writePacket(index + this->m_inputOffset, x); + } else { + const Index idx = index / this->m_stride; + const Index rem = index - idx * this->m_stride; + if (rem + PacketSize <= this->m_stride) { + const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; + this->m_impl.template writePacket(inputIndex, x); + } else { + // Cross stride boundary. Fallback to slow path. + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + internal::pstore(values, x); + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index) = values[i]; + ++index; + } + } + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) { + eigen_assert(this->m_impl.data() != NULL); + + const Index chip_dim = this->m_dim.actualDim(); + + DSizes input_block_dims; + for (int i = 0; i < NumInputDims; ++i) { + input_block_dims[i] = i < chip_dim ? desc.dimension(i) : i > chip_dim ? desc.dimension(i - 1) : 1; + } + + typedef TensorReshapingOp, const typename TensorBlock::XprType> TensorBlockExpr; + + typedef internal::TensorBlockAssignment TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target(input_block_dims, internal::strides(this->m_impl.dimensions()), + this->m_impl.data(), this->srcCoeff(desc.offset())), + block.expr().reshape(input_block_dims)); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h new file mode 100644 index 00000000000..e9c130dceb2 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -0,0 +1,354 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorConcatenationOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor concatenation class. + * + * + */ +namespace internal { +template +struct traits > { + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
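+  // [Illustration, not part of the upstream file] Concatenation joins two
+  // expressions along one axis; every other dimension must match:
+  //
+  //   Eigen::Tensor<float, 2> a(2, 3), b(2, 4);
+  //   Eigen::Tensor<float, 2> c = a.concatenate(b, 1);  // size [2, 7]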
+ typedef typename promote_storage_type::ret Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef + typename promote_index_type::Index, typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = traits::Layout; + enum { Flags = 0 }; + typedef std::conditional_t::val, + typename traits::PointerType, typename traits::PointerType> + PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorConcatenationOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorConcatenationOp type; +}; + +} // end namespace internal + +template +class TensorConcatenationOp : public TensorBase, WriteAccessors> { + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::nested::type Nested; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& lhsExpression() const { + return m_lhs_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& rhsExpression() const { + return m_rhs_xpr; + } + + EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorConcatenationOp) + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Axis m_axis; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorConcatenationOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + static constexpr int RightNumDims = + internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = + TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout) || + NumDims == 1), + YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE); + 
EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(0 <= m_axis && m_axis < NumDims); + const Dimensions& lhs_dims = m_leftImpl.dimensions(); + const Dimensions& rhs_dims = m_rightImpl.dimensions(); + { + int i = 0; + for (; i < m_axis; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + eigen_assert(lhs_dims[i] > 0); // Now i == m_axis. + eigen_assert(rhs_dims[i] > 0); + m_dimensions[i] = lhs_dims[i] + rhs_dims[i]; + for (++i; i < NumDims; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_leftStrides[0] = 1; + m_rightStrides[0] = 1; + m_outputStrides[0] = 1; + + for (int j = 1; j < NumDims; ++j) { + m_leftStrides[j] = m_leftStrides[j - 1] * lhs_dims[j - 1]; + m_rightStrides[j] = m_rightStrides[j - 1] * rhs_dims[j - 1]; + m_outputStrides[j] = m_outputStrides[j - 1] * m_dimensions[j - 1]; + } + } else { + m_leftStrides[NumDims - 1] = 1; + m_rightStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + + for (int j = NumDims - 2; j >= 0; --j) { + m_leftStrides[j] = m_leftStrides[j + 1] * lhs_dims[j + 1]; + m_rightStrides[j] = m_rightStrides[j + 1] * rhs_dims[j + 1]; + m_outputStrides[j] = m_outputStrides[j + 1] * m_dimensions[j + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear? + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow. + // See CL/76180724 comments for more ideas. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + // Collect dimension-wise indices (subs). 
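+    // [Illustration, not part of the upstream file] Worked example, ColMajor,
+    // lhs [2, 3] and rhs [2, 4] concatenated on axis 1 (output [2, 7]):
+    // linear index 9 decomposes into subs = {9 % 2, 9 / 2} = {1, 4}; since
+    // subs[1] >= 3, the coefficient comes from the rhs at {1, 4 - 3} = {1, 1}.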
+ array subs; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[NumDims - 1] = index; + } + + const Dimensions& left_dims = m_leftImpl.dimensions(); + if (subs[m_axis] < left_dims[m_axis]) { + Index left_index; + if (static_cast(Layout) == static_cast(ColMajor)) { + left_index = subs[0]; + EIGEN_UNROLL_LOOP + for (int i = 1; i < NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } else { + left_index = subs[NumDims - 1]; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i >= 0; --i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } + return m_leftImpl.coeff(left_index); + } else { + subs[m_axis] -= left_dims[m_axis]; + const Dimensions& right_dims = m_rightImpl.dimensions(); + Index right_index; + if (static_cast(Layout) == static_cast(ColMajor)) { + right_index = subs[0]; + EIGEN_UNROLL_LOOP + for (int i = 1; i < NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } else { + right_index = subs[NumDims - 1]; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i >= 0; --i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } + return m_rightImpl.coeff(right_index); + } + } + + // TODO(phli): Add a real vectorization. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + const int packetSize = PacketType::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost() + TensorOpCost::ModCost()); + const double lhs_size = m_leftImpl.dimensions().TotalSize(); + const double rhs_size = m_rightImpl.dimensions().TotalSize(); + return (lhs_size / (lhs_size + rhs_size)) * m_leftImpl.costPerCoeff(vectorized) + + (rhs_size / (lhs_size + rhs_size)) * m_rightImpl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_leftStrides; + array m_rightStrides; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; + const Axis m_axis; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorConcatenationOp XprType; + typedef typename Base::Dimensions Dimensions; + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = + TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef 
internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) : Base(op, device) { + EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + // Collect dimension-wise indices (subs). + array subs; + for (int i = Base::NumDims - 1; i > 0; --i) { + subs[i] = index / this->m_outputStrides[i]; + index -= subs[i] * this->m_outputStrides[i]; + } + subs[0] = index; + + const Dimensions& left_dims = this->m_leftImpl.dimensions(); + if (subs[this->m_axis] < left_dims[this->m_axis]) { + Index left_index = subs[0]; + for (int i = 1; i < Base::NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i]; + } + return this->m_leftImpl.coeffRef(left_index); + } else { + subs[this->m_axis] -= left_dims[this->m_axis]; + const Dimensions& right_dims = this->m_rightImpl.dimensions(); + Index right_index = subs[0]; + for (int i = 1; i < Base::NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i]; + } + return this->m_rightImpl.coeffRef(right_index); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + const int packetSize = PacketType::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize()); + + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + internal::pstore(values, x); + for (int i = 0; i < packetSize; ++i) { + coeffRef(index + i) = values[i]; + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h new file mode 100644 index 00000000000..6f1c9744ee0 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -0,0 +1,962 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorContraction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor contraction class. + * + * + */ +namespace internal { + +template +struct traits> { + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
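+  // [Illustration, not part of the upstream file] Contraction generalizes
+  // matrix multiplication; contracting dimension 1 of `a` against dimension 0
+  // of `b` is an ordinary matrix product:
+  //
+  //   Eigen::Tensor<float, 2> a(2, 3), b(3, 4);
+  //   Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
+  //   Eigen::Tensor<float, 2> c = a.contract(b, dims);  // size [2, 4]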
+ typedef typename gebp_traits, + std::remove_const_t>::ResScalar Scalar; + + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef + typename promote_index_type::Index, typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; + + // From NumDims below. + static constexpr int NumDimensions = + traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; + static constexpr int Layout = traits::Layout; + typedef std::conditional_t::val, + typename traits::PointerType, typename traits::PointerType> + PointerType; + + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorContractionOp& type; +}; + +template +struct nested, 1, + typename eval>::type> { + typedef TensorContractionOp type; +}; + +template +struct traits< + TensorEvaluator, Device_>> { + typedef Indices_ Indices; + typedef LeftArgType_ LeftArgType; + typedef RightArgType_ RightArgType; + typedef OutputKernelType_ OutputKernelType; + typedef Device_ Device; + + // From NumDims below. + static constexpr int NumDimensions = + traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; +}; + +// Helper class to allocate and deallocate temporary memory for packed buffers. +template +struct TensorContractionBlockMemAllocator { + typedef void* BlockMemHandle; + + template + EIGEN_DEVICE_FUNC static BlockMemHandle allocate(Device& d, const Index bm, const Index bk, const Index bn, + LhsScalar** lhs_block, RhsScalar** rhs_block) { + eigen_assert(lhs_block); + eigen_assert(rhs_block); + BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); + char* block_mem = static_cast(d.allocate(sz.lhs_size + sz.rhs_size)); + *lhs_block = static_cast(static_cast(block_mem)); + *rhs_block = static_cast(static_cast(block_mem + sz.lhs_size)); + return block_mem; + } + + template + EIGEN_DEVICE_FUNC static BlockMemHandle allocateSlices(Device& d, const Index bm, const Index bk, const Index bn, + const Index num_lhs, const Index num_rhs, + const Index num_slices, std::vector* lhs_blocks, + std::vector* rhs_blocks) { + eigen_assert(num_slices > 0); + eigen_assert(num_lhs >= 0 && num_rhs >= 0); + eigen_assert(num_lhs == 0 || lhs_blocks); + eigen_assert(num_rhs == 0 || rhs_blocks); + BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); + void* block_mem = d.allocate((num_lhs * sz.lhs_size + num_rhs * sz.rhs_size) * num_slices); + eigen_assert(block_mem); + char* mem = static_cast(block_mem); + + for (Index x = 0; x < num_slices; x++) { + if (num_lhs > 0) lhs_blocks[x].resize(num_lhs); + for (Index m = 0; m < num_lhs; m++) { + lhs_blocks[x][m] = static_cast(static_cast(mem)); + mem += sz.lhs_size; + } + if (num_rhs > 0) rhs_blocks[x].resize(num_rhs); + for (Index n = 0; n < num_rhs; n++) { + rhs_blocks[x][n] = static_cast(static_cast(mem)); + mem += sz.rhs_size; + } + } + + return block_mem; + } + + template + EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { + d.deallocate(handle); + } + + private: + struct BlockSizes { + Index lhs_size; + Index rhs_size; + }; + EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm, const Index bk, const Index bn) { + Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + BlockSizes sz; + sz.lhs_size = numext::div_ceil(bm * bk * sizeof(LhsScalar), align) * align; + sz.rhs_size = numext::div_ceil(bn * bk 
* sizeof(RhsScalar), align) * align;
+    return sz;
+  }
+};
+
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. Default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlockPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low level libraries to
+// perform matrix multiplication, and still rely on the Eigen contraction
+// evaluator. This also includes full support in TensorContractionThreadPool,
+// assuming that the underlying gemm does not use its own threading.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
+//   multiplication, lhs tensor and rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice almost
+//   always is Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+//   practice it's always column major blas_data_mapper (it must be of
+//   ResScalar type).
+//
+// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two
+//   dimensional view into the Lhs/Rhs tensor expressions. In practice it's
+//   TensorContractionInputMapper, or some specialization of it based on the
+//   type of tensor expression (e.g. TensorImagePatchOp has optimized input
+//   mapper).
+template <typename ResScalar, typename LhsScalar, typename RhsScalar, typename StorageIndex, typename OutputMapper,
+          typename LhsMapper, typename RhsMapper>
+struct TensorContractionKernel {
+  // True if `invoke()` supports `beta` in `C <- alpha * A * B + beta * C`
+  // (otherwise beta should be always equal to 1).
+  enum { HasBeta = false };
+
+  EIGEN_DEVICE_FUNC TensorContractionKernel(StorageIndex m_, StorageIndex k_, StorageIndex n_, StorageIndex bm_,
+                                            StorageIndex bk_, StorageIndex bn_)
+      : m(m_), k(k_), n(n_), bm(bm_), bk(bk_), bn(bn_) {}
+
+  // Pack blocks of Lhs and Rhs into contiguous blocks in memory.
+  typedef LhsScalar* LhsBlock;
+  typedef RhsScalar* RhsBlock;
+
+  // Packed Lhs/Rhs block memory allocator.
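+  // [Illustration, not part of the upstream file] One possible loop nest that
+  // composes packLhs/packRhs/invoke into a blocked GEMM over an [m x k] lhs
+  // and a [k x n] rhs (the real traversal order is chosen by the evaluator):
+  //
+  //   for (StorageIndex i = 0; i < m; i += bm)
+  //     for (StorageIndex d = 0; d < k; d += bk) {
+  //       packLhs(&lhs_block, lhs.getSubMapper(i, d), bk, bm);
+  //       for (StorageIndex j = 0; j < n; j += bn) {
+  //         packRhs(&rhs_block, rhs.getSubMapper(d, j), bk, bn);
+  //         invoke(output.getSubMapper(i, j), lhs_block, rhs_block, bm, bk, bn,
+  //                alpha, /*beta=*/ResScalar(1));
+  //       }
+  //     }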
+ typedef TensorContractionBlockMemAllocator BlockMemAllocator; + typedef typename BlockMemAllocator::BlockMemHandle BlockMemHandle; + + typedef typename internal::gebp_traits Traits; + + typedef internal::gemm_pack_lhs + LhsPacker; + + typedef internal::gemm_pack_rhs + RhsPacker; + + typedef internal::gebp_kernel + GebpKernel; + + template + EIGEN_DEVICE_FUNC BlockMemHandle allocate(Device& d, LhsBlock* lhs_block, RhsBlock* rhs_block) { + return BlockMemAllocator::allocate(d, bm, bk, bn, lhs_block, rhs_block); + } + + template + EIGEN_DEVICE_FUNC BlockMemHandle allocateSlices(Device& d, const StorageIndex num_lhs, const StorageIndex num_rhs, + const StorageIndex num_slices, std::vector* lhs_blocks, + std::vector* rhs_blocks) { + return BlockMemAllocator::allocateSlices(d, bm, bk, bn, num_lhs, num_rhs, num_slices, lhs_blocks, rhs_blocks); + } + + template + EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { + BlockMemAllocator::deallocate(d, handle); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packLhs(LhsBlock* lhsBlock, const typename LhsMapper::SubMapper& data_mapper, + const StorageIndex depth, const StorageIndex rows) { + LhsPacker()(*lhsBlock, data_mapper, depth, rows, /*stride*/ 0, + /*offset*/ 0); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packRhs(RhsBlock* rhsBlock, const typename RhsMapper::SubMapper& data_mapper, + const StorageIndex depth, const StorageIndex cols) { + RhsPacker()(*rhsBlock, data_mapper, depth, cols); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void invoke(const OutputMapper& output_mapper, const LhsBlock& lhsBlock, + const RhsBlock& rhsBlock, const StorageIndex rows, + const StorageIndex depth, const StorageIndex cols, + const ResScalar alpha, const ResScalar beta) { + // Default GEBP kernel does not support beta. + eigen_assert(beta == ResScalar(1)); + static const int kComputeStrideFromBlockDimensions = -1; + GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha, + /*strideA*/ kComputeStrideFromBlockDimensions, + /*strideB*/ kComputeStrideFromBlockDimensions, + /*offsetA*/ 0, /*offsetB*/ 0); + } + + private: + // These are dimensions of the original Tensors, and selected block sizes. The + // actual block sizes passed to all function above might be smaller because of + // the partial blocks at the end. + const StorageIndex m; + const StorageIndex k; + const StorageIndex n; + const StorageIndex bm; + const StorageIndex bk; + const StorageIndex bn; +}; + +} // end namespace internal + +// Tensor contraction params that should enable to get from output matrix +// 2-dimensional coordinates to the output tensor dimensions. +struct TensorContractionParams { + // TensorContraction evaluator assumes that both tensors are in ColMajor + // layout, if tensors are in RowMajor evaluator swap lhs with rhs. + bool swapped_arguments; +}; + +// Output kernel allows to fuse operations into the tensor contraction. +// +// Examples: +// 1. Elementwise Relu transformation following Conv2D. +// 2. AddBias to the Conv2D output channels dimension. +// +// The NoOpOutputKernel implements an output kernel that does absolutely nothing. +struct NoOpOutputKernel { + /** + * Tensor contraction evaluator calls this kernel after finishing each block + * of output matrix. Output blocks belong to the 2-dimensional output tensor. + * + * TensorContractionParams contains contraction dimensions information + * required to map output 2-d space into the expected output tensor space + * (potentially higher dimensional). 
+ * + * \param[in] output_mapper Access to output tensor memory + * \param[in] params Tensor contraction parameters + * \param[in] i Index of a first row available through output_mapper + * \param[in] j Index of a first column available through output_mapper + * \param[in] num_rows Number of available rows + * \param[in] num_cols Number of available columns + */ + template + EIGEN_ALWAYS_INLINE void operator()(const internal::blas_data_mapper& output_mapper, + const TensorContractionParams& params, Index i, Index j, Index num_rows, + Index num_cols) const { + EIGEN_UNUSED_VARIABLE(output_mapper); + EIGEN_UNUSED_VARIABLE(params); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j); + EIGEN_UNUSED_VARIABLE(num_rows); + EIGEN_UNUSED_VARIABLE(num_cols); + } +}; + +template +class TensorContractionOp + : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename internal::gebp_traits::ResScalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(const LhsXprType& lhs, const RhsXprType& rhs, + const Indices& dims, + const OutputKernelType& output_kernel = OutputKernelType()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims), m_output_kernel(output_kernel) {} + + EIGEN_DEVICE_FUNC const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC const internal::remove_all_t& lhsExpression() const { + return m_lhs_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& rhsExpression() const { + return m_rhs_xpr; + } + + EIGEN_DEVICE_FUNC const OutputKernelType& outputKernel() const { return m_output_kernel; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Indices m_indices; + const OutputKernelType m_output_kernel; +}; + +template +struct TensorContractionEvaluatorBase { + typedef typename internal::traits::Indices Indices; + typedef typename internal::traits::LeftArgType LeftArgType; + typedef typename internal::traits::RightArgType RightArgType; + typedef typename internal::traits::OutputKernelType OutputKernelType; + typedef typename internal::traits::Device Device; + + typedef TensorContractionOp XprType; + typedef std::remove_const_t Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = true, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = true + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
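+  // [Illustration, not part of the upstream file] The swap is sound because a
+  // RowMajor matrix reinterpreted as ColMajor is its transpose, and
+  //
+  //   C = A * B  <=>  C^T = B^T * A^T,
+  //
+  // so exchanging the operands while flipping the layout yields the correct
+  // result without moving any data.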
+ typedef std::conditional_t(Layout) == static_cast(ColMajor), LeftArgType, RightArgType> + EvalLeftArgType; + typedef std::conditional_t(Layout) == static_cast(ColMajor), RightArgType, LeftArgType> + EvalRightArgType; + + typedef TensorEvaluator LeftEvaluatorType; + typedef TensorEvaluator RightEvaluatorType; + + static constexpr int LDims = + internal::array_size::Dimensions>::value; + static constexpr int RDims = + internal::array_size::Dimensions>::value; + static constexpr int ContractDims = internal::array_size::value; + static constexpr int NumDims = LDims + RDims - 2 * ContractDims; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + typedef DSizes Dimensions; + + EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.lhsExpression(), + op.rhsExpression()), + device), + m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.rhsExpression(), + op.lhsExpression()), + device), + m_device(device), + m_output_kernel(op.outputKernel()), + m_result(NULL) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + DSizes eval_left_dims; + DSizes eval_right_dims; + array, ContractDims> eval_op_indices; + if (static_cast(Layout) == static_cast(ColMajor)) { + // For ColMajor, we keep using the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[i]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[i]; + } + // We keep the pairs of contracting indices. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = op.indices()[i].first; + eval_op_indices[i].second = op.indices()[i].second; + } + } else { + // For RowMajor, we need to reverse the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; + } + // We need to flip all the pairs of contracting indices as well as + // reversing the dimensions. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first; + } + } + + // Check for duplicate axes and make sure the first index in eval_op_indices + // is increasing. 
Using O(n^2) sorting is OK since ContractDims is small + for (int i = 0; i < ContractDims; i++) { + for (int j = i + 1; j < ContractDims; j++) { + eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first && + eval_op_indices[j].second != eval_op_indices[i].second && "contraction axes should be unique"); + if (eval_op_indices[j].first < eval_op_indices[i].first) { + numext::swap(eval_op_indices[j], eval_op_indices[i]); + } + } + } + + array lhs_strides; + lhs_strides[0] = 1; + for (int i = 0; i < LDims - 1; ++i) { + lhs_strides[i + 1] = lhs_strides[i] * eval_left_dims[i]; + } + + array rhs_strides; + rhs_strides[0] = 1; + for (int i = 0; i < RDims - 1; ++i) { + rhs_strides[i + 1] = rhs_strides[i] * eval_right_dims[i]; + } + + if (m_i_strides.size() > 0) m_i_strides[0] = 1; + if (m_j_strides.size() > 0) m_j_strides[0] = 1; + if (m_k_strides.size() > 0) m_k_strides[0] = 1; + + m_i_size = 1; + m_j_size = 1; + m_k_size = 1; + + // To compute the dimension, we simply concatenate the non-contracting + // dimensions of the left and then the right tensor. Additionally, we also + // compute the strides corresponding to the left non-contracting + // dimensions and right non-contracting dimensions. + m_lhs_inner_dim_contiguous = true; + int dim_idx = 0; + Index nocontract_idx = 0; + + for (int i = 0; i < LDims; i++) { + // find if we are contracting on index i of left tensor + bool contracting = false; + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].first == i) { + contracting = true; + break; + } + } + if (!contracting) { + // add dimension size to output dimensions + m_dimensions[dim_idx] = eval_left_dims[i]; + m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; + if (dim_idx != i) { + m_lhs_inner_dim_contiguous = false; + } + if (nocontract_idx + 1 < internal::array_size::value) { + m_i_strides[nocontract_idx + 1] = m_i_strides[nocontract_idx] * eval_left_dims[i]; + } else { + m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; + } + dim_idx++; + nocontract_idx++; + } + } + + nocontract_idx = 0; + for (int i = 0; i < RDims; i++) { + bool contracting = false; + // find if we are contracting on index i of right tensor + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].second == i) { + contracting = true; + break; + } + } + if (!contracting) { + m_dimensions[dim_idx] = eval_right_dims[i]; + if (nocontract_idx + 1 < internal::array_size::value) { + m_j_strides[nocontract_idx + 1] = m_j_strides[nocontract_idx] * eval_right_dims[i]; + } else { + m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; + } + m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; + dim_idx++; + nocontract_idx++; + } + } + + // Now compute the strides corresponding to the contracting dimensions. We + // assumed above that non-contracting axes are represented in the same order + // in the matrix as they are in the tensor. This is not the case for + // contracting axes. As the contracting axes must be of the same size in + // each tensor, we'll only look at the first tensor here. 
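+    // Illustrative note (not in the upstream source): for a ColMajor tensor
+    // with dimensions (d0, d1, d2) the strides built above are (1, d0, d0 * d1),
+    // i.e. element (i, j, k) lives at offset i + j * d0 + k * d0 * d1.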
+    m_rhs_inner_dim_contiguous = true;
+    m_rhs_inner_dim_reordered = false;
+    for (int i = 0; i < ContractDims; i++) {
+      Index left = eval_op_indices[i].first;
+      Index right = eval_op_indices[i].second;
+
+      Index size = eval_left_dims[left];
+      eigen_assert(size == eval_right_dims[right] && "Contraction axes must be same size");
+
+      if (i + 1 < static_cast<int>(internal::array_size<contract_t>::value)) {
+        m_k_strides[i + 1] = m_k_strides[i] * size;
+      } else {
+        m_k_size = m_k_strides[i] * size;
+      }
+      m_left_contracting_strides[i] = lhs_strides[left];
+      m_right_contracting_strides[i] = rhs_strides[right];
+
+      if (i > 0 && right < eval_op_indices[i - 1].second) {
+        m_rhs_inner_dim_reordered = true;
+      }
+      if (right != i) {
+        m_rhs_inner_dim_contiguous = false;
+      }
+    }
+
+    // If the layout is RowMajor, we need to reverse the m_dimensions
+    if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
+      for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
+        numext::swap(m_dimensions[i], m_dimensions[j]);
+      }
+    }
+
+    // A set of parameters that will allow the output kernel to get from the
+    // output tensor dimensions (i, j) into the original tensor dimensions.
+    // TODO(ezhulenev): Add parameters required to infer output tensor index for
+    // more complex contractions than 2x2 on internal dimension.
+    m_tensor_contraction_params.swapped_arguments = static_cast<int>(Layout) == RowMajor;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    m_leftImpl.evalSubExprsIfNeeded(NULL);
+    m_rightImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      evalTo(data);
+      return false;
+    } else {
+      m_result = static_cast<EvaluatorPointerType>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      evalTo(m_result);
+      return true;
+    }
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
+      m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
+        if (dest) {
+          evalToAsync(dest, [done]() { done(false); });
+        } else {
+          m_result = static_cast<EvaluatorPointerType>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+          evalToAsync(m_result, [done]() { done(true); });
+        }
+      });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+#ifndef TENSOR_CONTRACTION_DISPATCH
+#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \
+  if (this->m_lhs_inner_dim_contiguous) {                    \
+    if (this->m_rhs_inner_dim_contiguous) {                  \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<true, true, true, ALIGNMENT> ARGS;            \
+      } else {                                               \
+        METHOD<true, true, false, ALIGNMENT> ARGS;           \
+      }                                                      \
+    } else {                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<true, false, true, ALIGNMENT> ARGS;           \
+      } else {                                               \
+        METHOD<true, false, false, ALIGNMENT> ARGS;          \
+      }                                                      \
+    }                                                        \
+  } else {                                                   \
+    if (this->m_rhs_inner_dim_contiguous) {                  \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<false, true, true, ALIGNMENT> ARGS;           \
+      } else {                                               \
+        METHOD<false, true, false, ALIGNMENT> ARGS;          \
+      }                                                      \
+    } else {                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<false, false, true, ALIGNMENT> ARGS;          \
+      } else {                                               \
+        METHOD<false, false, false, ALIGNMENT> ARGS;         \
+      }                                                      \
+    }                                                        \
+  }
+#endif
+
+#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH
+#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \
+  if (this->m_lhs_inner_dim_contiguous) {                                    \
+    if (this->m_rhs_inner_dim_contiguous) {                                  \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<true, true, true, ALIGNMENT> ARGS)->FN;                  \
+      } else {                                                               \
+        (new METHOD<true, true, false, ALIGNMENT> ARGS)->FN;                 \
+      }                                                                      \
+    } else {                                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<true, false, true, ALIGNMENT> ARGS)->FN;                 \
+      } else {                                                               \
+        (new METHOD<true, false, false, ALIGNMENT> ARGS)->FN;                \
+      }                                                                      \
+    }                                                                        \
+  } else {                                                                   \
+    if (this->m_rhs_inner_dim_contiguous) {                                  \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<false, true, true, ALIGNMENT> ARGS)->FN;                 \
+      } else {                                                               \
+        (new METHOD<false, true, false, ALIGNMENT> ARGS)->FN;                \
+      }                                                                      \
+    } else {                                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<false, false, true, ALIGNMENT> ARGS)->FN;                \
+      } else {                                                               \
+        (new METHOD<false, false, false, ALIGNMENT> ARGS)->FN;               \
+      }                                                                      \
+    }                                                                        \
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
+    static_cast<const Derived*>(this)->template evalProduct<Unaligned>(buffer);
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalToCallback>
+  void evalToAsync(Scalar* buffer, EvalToCallback done) const {
+    static_cast<const Derived*>(this)->template evalProductAsync<EvalToCallback, Unaligned>(buffer, std::move(done));
+  }
+#endif  // EIGEN_USE_THREADS
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalProductSequential(Scalar* buffer) const {
+    if (this->m_j_size == 1) {
+      this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(
+          buffer);
+    } else {
+      this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(
+          buffer);
+    }
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+#if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+#endif
+      void
+      evalGemv(Scalar* buffer) const {
+    const Index rows = m_i_size;
+    const Index cols = m_k_size;
+
+    typedef std::remove_const_t<typename EvalLeftArgType::Scalar> LhsScalar;
+    typedef std::remove_const_t<typename EvalRightArgType::Scalar> RhsScalar;
+    typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+    typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+    const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+    const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
+    const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned;
+    const int rhs_alignment = RightEvaluator::IsAligned ? Aligned : Unaligned;
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+                                                   contract_t, lhs_packet_size, lhs_inner_dim_contiguous, false,
+                                                   lhs_alignment>
+        LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, RightEvaluator,
+                                                   right_nocontract_t, contract_t, rhs_packet_size,
+                                                   rhs_inner_dim_contiguous, rhs_inner_dim_reordered, rhs_alignment>
+        RhsMapper;
+
+    LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
+    RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
+
+    const Scalar alpha(1);
+    const Index resIncr(1);
+
+    // zero out the result buffer (which must be of size at least rows * sizeof(Scalar))
+    m_device.fill(buffer, buffer + rows, Scalar(0));
+
+    internal::general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, false, RhsScalar, RhsMapper,
+                                            false>::run(rows, cols, lhs, rhs, buffer, resIncr, alpha);
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+    m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params, static_cast<Index>(0),
+                    static_cast<Index>(0), rows, static_cast<Index>(1));
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+#if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+#endif
+      void
+      evalGemm(Scalar* buffer) const {
+    // columns in left side, rows in right side
+    const Index k = this->m_k_size;
+    this->template evalGemmPartial<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
+                                   Alignment, /*use_output_kernel*/ true>(buffer, 0, k, 1);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel(Scalar* buffer, Index k_start, Index k_end,
+                                                            int num_threads) const {
+    evalGemmPartial<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment,
+                    /*use_output_kernel*/ false>(buffer, k_start, k_end, num_threads);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment,
+            bool use_output_kernel>
+  EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+    eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= this->m_k_size);
+    // columns in slice on left side, rows on right side
+    const Index k_slice = k_end - k_start;
+
+    // rows in left side
+    const Index m = this->m_i_size;
+
+    // columns in right side
+    const Index n = this->m_j_size;
+
+    // define data mappers for Lhs and Rhs
+    typedef std::remove_const_t<typename EvalLeftArgType::Scalar> LhsScalar;
+    typedef std::remove_const_t<typename EvalRightArgType::Scalar> RhsScalar;
+
+    typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+    typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+    const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+    const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
+
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+                                                   contract_t, lhs_packet_size, lhs_inner_dim_contiguous, false,
+                                                   Unaligned>
+        LhsMapper;
+
+    typedef
internal::TensorContractionInputMapper + RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + typedef internal::TensorContractionKernel + TensorContractionKernel; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + // Sizes of the blocks to load in cache. See the Goto paper for details. + internal::TensorContractionBlocking blocking( + k_slice, m, n, num_threads); + const Index kc = blocking.kc(); + const Index mc = numext::mini(m, blocking.mc()); + const Index nc = numext::mini(n, blocking.nc()); + + typedef typename TensorContractionKernel::LhsBlock LhsBlock; + typedef typename TensorContractionKernel::RhsBlock RhsBlock; + + LhsBlock blockA; + RhsBlock blockB; + + TensorContractionKernel kernel(m, k_slice, n, mc, kc, nc); + + typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; + const BlockMemHandle packed_mem = kernel.allocate(this->m_device, &blockA, &blockB); + + // If a contraction kernel does not support beta, explicitly initialize + // output buffer with zeroes. + if (!TensorContractionKernel::HasBeta) { + this->m_device.fill(buffer, buffer + m * n, Scalar(0)); + } + + for (Index i2 = 0; i2 < m; i2 += mc) { + const Index actual_mc = numext::mini(i2 + mc, m) - i2; + for (Index k2 = k_start; k2 < k_end; k2 += kc) { + // make sure we don't overshoot right edge of left matrix, then pack vertical panel + const Index actual_kc = numext::mini(k2 + kc, k_end) - k2; + kernel.packLhs(&blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); + + // If kernel supports beta, there is no need to initialize output + // buffer with zeroes. + const Scalar alpha = Scalar(1); + const Scalar beta = (TensorContractionKernel::HasBeta && k2 == k_start) ? Scalar(0) : Scalar(1); + + // series of horizontal blocks + for (Index j2 = 0; j2 < n; j2 += nc) { + // make sure we don't overshoot right edge of right matrix, then pack block + const Index actual_nc = numext::mini(j2 + nc, n) - j2; + kernel.packRhs(&blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc); + + // call gebp (matrix kernel) + // The parameters here are copied from Eigen's GEMM implementation + const OutputMapper output_mapper = output.getSubMapper(i2, j2); + kernel.invoke(output_mapper, blockA, blockB, actual_mc, actual_kc, actual_nc, alpha, beta); + + // We are done with this [i2, j2] output block. 
+ if (use_output_kernel && k2 + kc >= k_end) { + m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2, actual_mc, actual_nc); + } + } + } + } + + kernel.deallocate(this->m_device, packed_mem); + } + + EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_result[index]; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; } + + protected: + Dimensions m_dimensions; + + contract_t m_k_strides; + contract_t m_left_contracting_strides; + contract_t m_right_contracting_strides; + + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + left_nocontract_t m_i_strides; + right_nocontract_t m_j_strides; + left_nocontract_t m_left_nocontract_strides; + right_nocontract_t m_right_nocontract_strides; + + Index m_i_size; + Index m_j_size; + Index m_k_size; + + TensorContractionParams m_tensor_contraction_params; + + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; + const Device EIGEN_DEVICE_REF m_device; + OutputKernelType m_output_kernel; + EvaluatorPointerType m_result; +}; + +// evaluator for default device +template +struct TensorEvaluator, Device> + : public TensorContractionEvaluatorBase< + TensorEvaluator, Device>> { + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef std::remove_const_t Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + static constexpr int Layout = TensorEvaluator::Layout; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef std::conditional_t(ColMajor), LeftArgType, RightArgType> EvalLeftArgType; + typedef std::conditional_t(ColMajor), RightArgType, LeftArgType> EvalRightArgType; + + static constexpr int LDims = + internal::array_size::Dimensions>::value; + static constexpr int RDims = + internal::array_size::Dimensions>::value; + static constexpr int ContractDims = internal::array_size::value; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static constexpr int NumDims = LDims + RDims - 2 * ContractDims; + + // Could we use NumDimensions here? 
+ typedef DSizes Dimensions; + + TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + template + void evalProduct(Scalar* buffer) const { + TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer)); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h new file mode 100644 index 00000000000..7fbe30a9f9f --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -0,0 +1,69 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +enum { ShardByRow = 0, ShardByCol = 1 }; + +// Default Blocking Strategy +template +class TensorContractionBlocking { + public: + /* + adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h` + requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h` + which in turn, requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in + `GeneralBlockPanelKernel.h` which in turn, requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in + `GeneralBlockPanelKernel.h` (else HIPCC will error out) + + However adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h` + results in NVCC erroring out with the following error + + ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901: + dynamic initialization is not supported for function-scope static variables within a __device__/__global__ + function + */ + +#if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC +#endif + TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) + : kc_(k), mc_(m), nc_(n) { + if (ShardingType == ShardByCol) { + computeProductBlockingSizes(kc_, mc_, nc_, num_threads); + } else { + computeProductBlockingSizes(kc_, nc_, mc_, num_threads); + } + + const int rhs_packet_size = internal::packet_traits::size; + kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ? 
kc_ : (kc_ / rhs_packet_size) * rhs_packet_size; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; } + + private: + StorageIndex kc_; + StorageIndex mc_; + StorageIndex nc_; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h new file mode 100644 index 00000000000..dbea8aa92af --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -0,0 +1,7 @@ + +#if defined(__clang__) || defined(__GNUC__) +#warning \ + "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file" +#endif + +#include "TensorContractionGpu.h" diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h new file mode 100644 index 00000000000..780e8961eae --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -0,0 +1,1387 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014-2015 Benoit Steiner +// Copyright (C) 2015 Navdeep Jaitly +// Copyright (C) 2014 Eric Martin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H + +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +template +__device__ EIGEN_STRONG_INLINE void EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, Scalar* lhs_shmem, + Scalar* rhs_shmem, const Index m_size, + const Index n_size, const Index k_size) { + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in threadIdx.x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. 
This gives no bank conflicts
+  // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
+
+  // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
+  // conflicts on writes and also none on reads.
+
+  // storage indices
+  const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
+  const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
+
+  const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
+  const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
+  const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
+  const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
+  const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
+  const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
+  const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
+  const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
+
+  const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
+  const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
+  const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
+  const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
+  const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
+  const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
+  const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
+  const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
+
+  // in the loading code, the following variables are important:
+  // threadIdx.x: the vertical position in an 8x8 block
+  // threadIdx.y: the vertical index of the 8x8 block in the grid
+  // threadIdx.z: the horizontal position in an 8x8 block
+  // k: the horizontal index of the 8x8 block in the grid
+  //
+  // The k parameter is implicit (it was the loop counter for a loop that went
+  // from 0 to <8, but now that loop is unrolled in the code below).
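The 9-element row pitch above avoids conflicts because consecutive rows land 9 words apart and 9 is coprime to the bank count. A minimal host-side sketch of that arithmetic (illustrative only, not part of the patch; it assumes 32 four-byte shared-memory banks and the x-fastest lane layout of one warp):

#include <cstdio>

int main() {
  // One warp covers threadIdx.x in [0, 8) and threadIdx.y in [0, 4); the LHS
  // store offset is 72 * y + 9 * x words, so its bank is that offset mod 32.
  bool seen[32] = {};
  for (int y = 0; y < 4; ++y)
    for (int x = 0; x < 8; ++x) seen[(72 * y + 9 * x) % 32] = true;
  int distinct = 0;
  for (bool s : seen) distinct += s;
  std::printf("distinct banks: %d\n", distinct);  // prints 32: conflict-free
  return 0;
}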
+ + const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = conv(0); \ + lhs_pf1 = conv(0); \ + lhs_pf2 = conv(0); \ + lhs_pf3 = conv(0); \ + lhs_pf4 = conv(0); \ + lhs_pf5 = conv(0); \ + lhs_pf6 = conv(0); \ + lhs_pf7 = conv(0); \ + \ + rhs_pf0 = conv(0); \ + rhs_pf1 = conv(0); \ + rhs_pf2 = conv(0); \ + rhs_pf3 = conv(0); \ + rhs_pf4 = conv(0); \ + rhs_pf5 = conv(0); \ + rhs_pf6 = conv(0); \ + rhs_pf7 = conv(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ + \ + if (rhs_horiz_7 < n_size) { \ + rhs_pf0 = 
rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } + +#define writeRegToShmem() \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = conv(0); \ + Scalar res(i, 1) = conv(0); \ + Scalar res(i, 2) = conv(0); \ + Scalar res(i, 3) = conv(0); \ + Scalar res(i, 4) = conv(0); \ + Scalar res(i, 5) = conv(0); \ + Scalar res(i, 6) = conv(0); \ + Scalar res(i, 7) = conv(0); + + internal::scalar_cast_op conv; + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. 
Despite common sense,
+    // the code is a bit faster with this here than at the bottom of the loop
+    __syncthreads();
+
+    prefetchIntoRegisters(base_k);
+    writeRegToShmem();
+
+#undef prefetchIntoRegisters
+#undef writeRegToShmem
+
+    // wait for shared mem packing to be done before starting computation
+    __syncthreads();
+
+    // compute 8x8 matrix product by outer product. This involves packing one column
+    // of LHS and one row of RHS into registers (takes 16 registers).
+
+#define lcol(i) _lcol##i
+    Scalar lcol(0);
+    Scalar lcol(1);
+    Scalar lcol(2);
+    Scalar lcol(3);
+    Scalar lcol(4);
+    Scalar lcol(5);
+    Scalar lcol(6);
+    Scalar lcol(7);
+
+#define rrow(j) _rrow##j
+    Scalar rrow(0);
+    Scalar rrow(1);
+    Scalar rrow(2);
+    Scalar rrow(3);
+    Scalar rrow(4);
+    Scalar rrow(5);
+    Scalar rrow(6);
+    Scalar rrow(7);
+
+    // Now x corresponds to k, y to m, and z to n
+    const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
+    const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
+
+#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
+#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
+
+#define loadData(i, j) \
+  lcol(0) = lhs_element(0, j); \
+  rrow(0) = rhs_element(i, 0); \
+  lcol(1) = lhs_element(1, j); \
+  rrow(1) = rhs_element(i, 1); \
+  lcol(2) = lhs_element(2, j); \
+  rrow(2) = rhs_element(i, 2); \
+  lcol(3) = lhs_element(3, j); \
+  rrow(3) = rhs_element(i, 3); \
+  lcol(4) = lhs_element(4, j); \
+  rrow(4) = rhs_element(i, 4); \
+  lcol(5) = lhs_element(5, j); \
+  rrow(5) = rhs_element(i, 5); \
+  lcol(6) = lhs_element(6, j); \
+  rrow(6) = rhs_element(i, 6); \
+  lcol(7) = lhs_element(7, j); \
+  rrow(7) = rhs_element(i, 7);
+
+#define computeCol(j) \
+  res(0, j) += lcol(0) * rrow(j); \
+  res(1, j) += lcol(1) * rrow(j); \
+  res(2, j) += lcol(2) * rrow(j); \
+  res(3, j) += lcol(3) * rrow(j); \
+  res(4, j) += lcol(4) * rrow(j); \
+  res(5, j) += lcol(5) * rrow(j); \
+  res(6, j) += lcol(6) * rrow(j); \
+  res(7, j) += lcol(7) * rrow(j);
+
+#define computePass(i) \
+  loadData(i, i); \
+  \
+  computeCol(0); \
+  computeCol(1); \
+  computeCol(2); \
+  computeCol(3); \
+  computeCol(4); \
+  computeCol(5); \
+  computeCol(6); \
+  computeCol(7);
+
+    computePass(0);
+    computePass(1);
+    computePass(2);
+    computePass(3);
+    computePass(4);
+    computePass(5);
+    computePass(6);
+    computePass(7);
+
+#undef lcol
+#undef rrow
+#undef lhs_element
+#undef rhs_element
+#undef loadData
+#undef computeCol
+#undef computePass
+  }  // end loop over k
+
+  // we've now iterated over all of the large (i.e., width-64) k blocks and
+  // accumulated results in registers. At this point thread (x, y, z) contains
+  // the sum across all big k blocks of the product of little k block of index (x, y)
+  // with block of index (y, z). To compute the final output, we need to reduce
+  // the 8 threads over y by summation.
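The reduction that follows uses XOR warp shuffles. As a standalone sketch of the same butterfly pattern (illustrative only, not part of the patch; it assumes CUDA 9+ for the __shfl_xor_sync intrinsic, and the kernel name is hypothetical):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void butterfly_sum8(float* out) {
  float v = static_cast<float>(threadIdx.x);
  // XOR masks 1, 2, 4: after the three steps, every lane in each aligned
  // group of 8 lanes holds that group's sum (cf. reduceMatrix(1/2/4) below).
  v += __shfl_xor_sync(0xFFFFFFFF, v, 1);
  v += __shfl_xor_sync(0xFFFFFFFF, v, 2);
  v += __shfl_xor_sync(0xFFFFFFFF, v, 4);
  out[threadIdx.x] = v;
}

int main() {
  float* d = nullptr;
  cudaMalloc(&d, 32 * sizeof(float));
  butterfly_sum8<<<1, 32>>>(d);
  float h[32];
  cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
  std::printf("%g %g\n", h[0], h[8]);  // 28 (= 0+..+7) and 92 (= 8+..+15)
  cudaFree(d);
  return 0;
}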
+#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+#else
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
+#endif
+
+#define reduceRow(i, mask) \
+  shuffleInc(i, 0, mask); \
+  shuffleInc(i, 1, mask); \
+  shuffleInc(i, 2, mask); \
+  shuffleInc(i, 3, mask); \
+  shuffleInc(i, 4, mask); \
+  shuffleInc(i, 5, mask); \
+  shuffleInc(i, 6, mask); \
+  shuffleInc(i, 7, mask);
+
+#define reduceMatrix(mask) \
+  reduceRow(0, mask); \
+  reduceRow(1, mask); \
+  reduceRow(2, mask); \
+  reduceRow(3, mask); \
+  reduceRow(4, mask); \
+  reduceRow(5, mask); \
+  reduceRow(6, mask); \
+  reduceRow(7, mask);
+
+  // actually perform the reduction, now each thread of index (_, y, z)
+  // contains the correct values in its registers that belong in the output
+  // block
+  reduceMatrix(1);
+  reduceMatrix(2);
+  reduceMatrix(4);
+
+#undef shuffleInc
+#undef reduceRow
+#undef reduceMatrix
+
+  // now we need to copy the 64 values into main memory. We can't split work
+  // among threads because all variables are in registers. There are two ways
+  // to do this:
+  // (1) have 1 thread do 64 writes from registers into global memory
+  // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
+  //     each do 8 writes into global memory. We can just overwrite the shared
+  //     memory from the problem we just solved.
+  // (2) is slightly faster than (1) due to less branching and more ILP
+
+  // TODO: won't yield much gain, but could just use currently unused shared mem
+  // and then we won't have to sync
+  // wait for shared mem to be out of use
+  __syncthreads();
+
+#define writeResultShmem(i, j) lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j);
+
+#define writeRow(i) \
+  writeResultShmem(i, 0); \
+  writeResultShmem(i, 1); \
+  writeResultShmem(i, 2); \
+  writeResultShmem(i, 3); \
+  writeResultShmem(i, 4); \
+  writeResultShmem(i, 5); \
+  writeResultShmem(i, 6); \
+  writeResultShmem(i, 7);
+
+  if (threadIdx.x == 0) {
+    writeRow(0);
+    writeRow(1);
+    writeRow(2);
+    writeRow(3);
+    writeRow(4);
+    writeRow(5);
+    writeRow(6);
+    writeRow(7);
+  }
+#undef writeResultShmem
+#undef writeRow
+
+  const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+  const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
+
+  if (threadIdx.x < max_i_write) {
+    if (max_j_write == 8) {
+      // TODO: can I trade bank conflicts for coalesced writes?
+ Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; + Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; + Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; + Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; + Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; + Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; + Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; + Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; + + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; + } + } + } +#undef res +} + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(512, 1) +#else +__launch_bounds__(512) +#endif + EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, const OutputMapper output, const Index m_size, + const Index n_size, const Index k_size) { + __shared__ Scalar lhs_shmem[72 * 64]; + __shared__ Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + +template +__device__ __forceinline__ void EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i = 0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg = lhs.template loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg = lhs.template loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x = lhs(row + 0, col); \ + reg.y = lhs(row + 1, col); \ + reg.z = lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x = lhs(row + 0, col); \ + reg.y = lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x = lhs(row + 0, col); \ + } \ + } \ + } + + Index lhs_vert = base_m + 
threadIdx.x * 4; + + for (Index k = 0; k < k_size; k += 16) { + lhs_pf0 = internal::pset1(0); + rhs_pf0 = internal::pset1(0); + + Index lhs_horiz = threadIdx.y + k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k + (threadIdx.x % 4) * 4; + Index rhs_horiz0 = (threadIdx.x >> 2) + threadIdx.y * 4 + base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2; + // the following can be a bitwise operation..... some day. + if ((threadIdx.x % 8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } +#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); +#else + x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4); + x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4); +#endif + if ((threadIdx.x % 8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. + // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... + rhs_shmem2[(threadIdx.x >> 3) + threadIdx.y * 2][threadIdx.x % 8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(threadIdx.x >> 3) + threadIdx.y * 2 + 32][threadIdx.x % 8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... 
+ + lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y + 16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); + +#define add_vals(fl1, fl2, fr1, fr2) \ + results[0].x += fl1.x * fr1.x; \ + results[0].y += fl1.y * fr1.x; \ + results[0].z += fl2.x * fr1.x; \ + results[0].w += fl2.y * fr1.x; \ + \ + results[1].x += fl1.x * fr1.y; \ + results[1].y += fl1.y * fr1.y; \ + results[1].z += fl2.x * fr1.y; \ + results[1].w += fl2.y * fr1.y; \ + \ + results[2].x += fl1.x * fr2.x; \ + results[2].y += fl1.y * fr2.x; \ + results[2].z += fl2.x * fr2.x; \ + results[2].w += fl2.y * fr2.x; \ + \ + results[3].x += fl1.x * fr2.y; \ + results[3].y += fl1.y * fr2.y; \ + results[3].z += fl2.x * fr2.y; \ + results[3].w += fl2.y * fr2.y; + + __syncthreads(); + +// Do the multiplies. +#pragma unroll + for (int koff = 0; koff < 16; koff++) { + // 32 x threads. + float2 fl1 = lhs_shmem2[koff][threadIdx.x]; + float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; + + int start_feature = threadIdx.y * 4; + float2 fr1 = rhs_shmem2[(start_feature >> 1) + 32 * ((koff % 4) / 2)][koff / 4 + (koff % 2) * 4]; + float2 fr2 = rhs_shmem2[(start_feature >> 1) + 1 + 32 * ((koff % 4) / 2)][koff / 4 + (koff % 2) * 4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = threadIdx.y * 4 + base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + }*/ + for (int i = 0; i < 4; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
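+    // Illustrative note (not in the upstream source): each thread owns a
+    // 4 (rows) x 4 (columns) output tile here, so when the tile straddles
+    // both matrix edges every row (lhs_vert + r) and column (horiz_base + i)
+    // must be bounds-checked before its store.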
+ for (int i = 0; i < 4; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + +template +__device__ __forceinline__ void EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i = 0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + Index lhs_vert = base_m + threadIdx.x * 4 + (threadIdx.y % 4) * 32; + for (Index k = 0; k < k_size; k += 32) { + lhs_pf0 = internal::pset1(0); + lhs_pf1 = internal::pset1(0); + lhs_pf2 = internal::pset1(0); + lhs_pf3 = internal::pset1(0); + + rhs_pf0 = internal::pset1(0); + rhs_pf1 = internal::pset1(0); + + if (!CHECK_LHS_BOUNDARY) { + if ((threadIdx.y / 4 + k + 24) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + lhs_pf2 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 16)); + lhs_pf3 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 24)); + } else if ((threadIdx.y / 4 + k + 16) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + lhs_pf2 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 16)); + } else if ((threadIdx.y / 4 + k + 8) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + } else if ((threadIdx.y / 4 + k) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((threadIdx.y / 4 + k + 24) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + lhs_pf2 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 16)); + lhs_pf3 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 24)); + } else if ((threadIdx.y / 4 + k + 16) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + lhs_pf2 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 16)); + } else if ((threadIdx.y / 4 + k + 8) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + } else if ((threadIdx.y / 4 + k) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + } + } else if (lhs_vert + 2 < m_size) { + if ((threadIdx.y / 4 + k + 24) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf0.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + 
k + 8)); + lhs_pf1.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + lhs_pf2.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 16)); + lhs_pf2.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 16)); + lhs_pf3.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 24)); + lhs_pf3.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 24)); + lhs_pf3.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 24)); + } else if ((threadIdx.y / 4 + k + 16) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf0.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8)); + lhs_pf1.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + lhs_pf2.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 16)); + lhs_pf2.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 16)); + } else if ((threadIdx.y / 4 + k + 8) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf0.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8)); + lhs_pf1.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 8)); + } else if ((threadIdx.y / 4 + k) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf0.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((threadIdx.y / 4 + k + 24) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + lhs_pf2.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 16)); + lhs_pf3.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 24)); + lhs_pf3.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 24)); + } else if ((threadIdx.y / 4 + k + 16) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + lhs_pf2.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 16)); + } else if ((threadIdx.y / 4 + k + 8) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8)); + } else if ((threadIdx.y / 4 + k) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + } + } else if (lhs_vert < m_size) { + if ((threadIdx.y / 4 + k + 24) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + lhs_pf3.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 24)); + } else if ((threadIdx.y / 4 + k + 16) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = 
lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + } else if ((threadIdx.y / 4 + k + 8) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + } else if ((threadIdx.y / 4 + k) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + } + } + } + __syncthreads(); + Index rhs_vert = k + threadIdx.x * 4; + Index rhs_horiz0 = threadIdx.y * 2 + base_n; + Index rhs_horiz1 = threadIdx.y * 2 + 1 + base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k + threadIdx.x * 4 + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k + threadIdx.x * 4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[threadIdx.y + 32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. 
+ rhs_shmem2[threadIdx.y + 64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[threadIdx.y + 96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4) \ + results[0].x += a_feat1.x * f1.x; \ + results[1].x += a_feat1.x * f1.y; \ + results[2].x += a_feat1.x * f2.x; \ + results[3].x += a_feat1.x * f2.y; \ + results[4].x += a_feat1.x * f3.x; \ + results[5].x += a_feat1.x * f3.y; \ + results[6].x += a_feat1.x * f4.x; \ + results[7].x += a_feat1.x * f4.y; \ + \ + results[0].y += a_feat1.y * f1.x; \ + results[1].y += a_feat1.y * f1.y; \ + results[2].y += a_feat1.y * f2.x; \ + results[3].y += a_feat1.y * f2.y; \ + results[4].y += a_feat1.y * f3.x; \ + results[5].y += a_feat1.y * f3.y; \ + results[6].y += a_feat1.y * f4.x; \ + results[7].y += a_feat1.y * f4.y; \ + \ + results[0].z += a_feat2.x * f1.x; \ + results[1].z += a_feat2.x * f1.y; \ + results[2].z += a_feat2.x * f2.x; \ + results[3].z += a_feat2.x * f2.y; \ + results[4].z += a_feat2.x * f3.x; \ + results[5].z += a_feat2.x * f3.y; \ + results[6].z += a_feat2.x * f4.x; \ + results[7].z += a_feat2.x * f4.y; \ + \ + results[0].w += a_feat2.y * f1.x; \ + results[1].w += a_feat2.y * f1.y; \ + results[2].w += a_feat2.y * f2.x; \ + results[3].w += a_feat2.y * f2.y; \ + results[4].w += a_feat2.y * f3.x; \ + results[5].w += a_feat2.y * f3.y; \ + results[6].w += a_feat2.y * f4.x; \ + results[7].w += a_feat2.y * f4.y; + + lhs_shmem2[threadIdx.y / 4][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y / 4 + 8][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[threadIdx.y / 4 + 16][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[threadIdx.y / 4 + 24][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[threadIdx.y / 4 + 32][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[threadIdx.y / 4 + 40][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[threadIdx.y / 4 + 48][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[threadIdx.y / 4 + 56][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + +// Do the multiplies. +#pragma unroll + for (int koff = 0; koff < 32; koff++) { + float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; + + // first feature is at (threadIdx.y/4) * 8 last is at start + 8. 
+ int start_feature = (threadIdx.y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature / 2 + (koff % 4) * 32][koff / 4]; + float2 br2 = rhs_shmem2[start_feature / 2 + 1 + (koff % 4) * 32][koff / 4]; + float2 br3 = rhs_shmem2[start_feature / 2 + 2 + (koff % 4) * 32][koff / 4]; + float2 br4 = rhs_shmem2[start_feature / 2 + 3 + (koff % 4) * 32][koff / 4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + +#undef add_vals + + __syncthreads(); + Index horiz_base = (threadIdx.y / 4) * 8 + base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
+ for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(256, 1) +#else +__launch_bounds__(256) +#endif + EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, const OutputMapper output, const Index m_size, + const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64 * 32]; + __shared__ float2 rhs_shmem[128 * 8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + bool check_rhs = (base_n + 63) >= n_size; + bool check_lhs128 = (base_m + 127) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM*)lhs_shmem), *((RHS_MEM*)rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM*)lhs_shmem), *((RHS_MEM*)rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM*)lhs_shmem), *((RHS_MEM*)rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM*)lhs_shmem), *((RHS_MEM*)rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(256, 1) +#else +__launch_bounds__(256) +#endif + EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + +template +struct TensorEvaluator, GpuDevice> + : public TensorContractionEvaluatorBase, GpuDevice> > { + typedef GpuDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef std::remove_const_t Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + static constexpr int Layout = TensorEvaluator::Layout; + + // 
Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef std::conditional_t(ColMajor), LeftArgType, RightArgType> EvalLeftArgType; + typedef std::conditional_t(ColMajor), RightArgType, LeftArgType> EvalRightArgType; + + static constexpr int LDims = + internal::array_size::Dimensions>::value; + static constexpr int RDims = + internal::array_size::Dimensions>::value; + static constexpr int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static constexpr int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef std::remove_const_t LhsScalar; + typedef std::remove_const_t RhsScalar; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { + EIGEN_STATIC_ASSERT((internal::is_same::value), + GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS); + } + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } + } + + template + struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, + const GpuDevice& device) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_GPU_KERNEL((EigenContractionKernel), num_blocks, + block_size, 0, device, lhs, rhs, output, m, n, k); + } + }; + + template + struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, + const GpuDevice& device) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16), num_blocks, + block_size, 0, device, lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 
num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_GPU_KERNEL((EigenFloatContractionKernel), num_blocks, + block_size, 0, device, lhs, rhs, output, m, n, k); + } + } + }; + + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + EIGEN_UNUSED_VARIABLE(k) + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)) + this->m_device.fill(buffer, buffer + m * n, Scalar(0)); + + typedef internal::TensorContractionInputMapper + LhsMapper; + + typedef internal::TensorContractionInputMapper + RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + +#if defined(EIGEN_USE_HIP) + setGpuSharedMemConfig(hipSharedMemBankSizeEightByte); +#else + setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte); +#endif + + LaunchKernels::Run(lhs, rhs, output, m, n, k, + this->m_device); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and EIGEN_GPUCC +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h new file mode 100644 index 00000000000..6367db96409 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -0,0 +1,533 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +enum { Rhs = 0, Lhs = 1 }; + +/* + * Implementation of the Eigen blas_data_mapper class for tensors. + */ +/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the +/// default make pointer is used which is scalar * for CoeffLoader. 
+template class MakePointer_ = MakePointer> +struct CoeffLoader; + +template class MakePointer_ = MakePointer> +class BaseTensorContractionMapper; + +template class MakePointer_> +struct CoeffLoader { + enum { DirectOffsets = false }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) { + eigen_assert(false && "unsupported"); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_::Type data() const { + eigen_assert(false && "unsupported"); + return NULL; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { + return m_tensor.coeff(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Tensor::PacketReturnType packet(typename Tensor::Index index) const { + return m_tensor.template packet(index); + } + + private: + const Tensor m_tensor; +}; + +template class MakePointer_> +struct CoeffLoader { + enum { DirectOffsets = true }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { m_data += offset; } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_::Type data() const { + return m_data; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { + return loadConstant(m_data + index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Tensor::PacketReturnType packet(typename Tensor::Index index) const { + return internal::ploadt_ro(m_data + index); + } + + private: + typedef typename Tensor::Scalar Scalar; + + typename MakePointer_::Type m_data; +}; + +template class MakePointer_ = MakePointer> +class SimpleTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC SimpleTensorContractionMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, const contract_t& contract_strides, + const contract_t& k_strides) + : m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) {} + + enum { DirectOffsets = CoeffLoader::DirectOffsets }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { + m_tensor.offsetBuffer(offset); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void prefetch(Index /*i*/) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 + Index nocontract_val = left ? 
row : col; + Index linidx = 0; + EIGEN_UNROLL_LOOP + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx = nocontract_val / m_ij_strides[i]; + linidx += idx * m_nocontract_strides[i]; + nocontract_val -= idx * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx += nocontract_val; + } else { + linidx += nocontract_val * m_nocontract_strides[0]; + } + } + + Index contract_val = left ? col : row; + if (array_size::value > 0) { + EIGEN_UNROLL_LOOP + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } + } + + return linidx; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, + const Index distance) const { + const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 + Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; + Index linidx[2] = {0, 0}; + if (array_size::value > array_size::value) { + EIGEN_UNROLL_LOOP + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx[0] += nocontract_val[0]; + linidx[1] += nocontract_val[1]; + } else { + linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; + linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; + } + } + + Index contract_val[2] = {left ? col : row, left ? col : row + distance}; + if (array_size::value > 0) { + EIGEN_UNROLL_LOOP + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } + + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } + } + return IndexPair(linidx[0], linidx[1]); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { + // Only claim alignment when we can compute the actual stride (ie when we're + // dealing with the lhs with inner_dim_contiguous. This is because the + // matrix-vector product relies on the stride when dealing with aligned inputs. + return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { + return ((side == Lhs) && inner_dim_contiguous && array_size::value > 0) ? 
m_contract_strides[0] : 1; + } + + const CoeffLoader& tensor() const { return m_tensor; } + + const nocontract_t& nocontract_strides() const { return m_nocontract_strides; } + const nocontract_t& ij_strides() const { return m_ij_strides; } + const contract_t& contract_strides() const { return m_contract_strides; } + const contract_t& k_strides() const { return m_k_strides; } + + protected: + CoeffLoader m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + +template class MakePointer_> +class BaseTensorContractionMapper + : public SimpleTensorContractionMapper { + public: + typedef SimpleTensorContractionMapper + ParentMapper; + + EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, const contract_t& contract_strides, + const contract_t& k_strides) + : ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + std::enable_if_t::size == packet_size, PacketT> + load(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + eigen_assert(this->computeIndex(i + packet_size - 1, j) == index + packet_size - 1); + return this->m_tensor.template packet(index); + } + + const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index lastIdx = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. 
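+    // Concretely: computeIndexPair returns endpoints exactly packet_size - 1 apart only when the whole packet is
+    // contiguous in memory, so the fast branch below can issue a single vectorized load; otherwise the elements are
+    // gathered pairwise into an aligned stack buffer and loaded with pload (a shuffled Rhs contracting dimension,
+    // for example, takes the gather path).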
+    if (Tensor::PacketAccess && (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) &&
+        (lastIdx - first) == (packet_size - 1)) {
+      return this->m_tensor.template packet(first);
+    }
+
+    EIGEN_ALIGN_MAX Scalar data[packet_size];
+
+    data[0] = this->m_tensor.coeff(first);
+    EIGEN_UNROLL_LOOP
+    for (Index k = 1; k < packet_size - 1; k += 2) {
+      const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1);
+      data[k] = this->m_tensor.coeff(internal_pair.first);
+      data[k + 1] = this->m_tensor.coeff(internal_pair.second);
+    }
+    data[packet_size - 1] = this->m_tensor.coeff(lastIdx);
+
+    return pload(data);
+  }
+
+  template
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      std::enable_if_t::size != packet_size, PacketT>
+      load(Index i, Index j) const {
+    const Index requested_packet_size = internal::unpacket_traits::size;
+    EIGEN_ALIGN_MAX Scalar data[requested_packet_size];
+
+    const IndexPair indexPair = this->computeIndexPair(i, j, requested_packet_size - 1);
+    const Index first = indexPair.first;
+    const Index lastIdx = indexPair.second;
+
+    data[0] = this->m_tensor.coeff(first);
+    for (Index k = 1; k < requested_packet_size - 1; k += 2) {
+      const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1);
+      data[k] = this->m_tensor.coeff(internal_pair.first);
+      data[k + 1] = this->m_tensor.coeff(internal_pair.second);
+    }
+    data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx);
+
+    return pload(data);
+  }
+
+  template
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
+    return this->load(i, j);
+  }
+};
+
+template class MakePointer_>
+class BaseTensorContractionMapper
+    : public SimpleTensorContractionMapper {
+ public:
+  typedef SimpleTensorContractionMapper
+      ParentMapper;
+
+  EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, const nocontract_t& nocontract_strides,
+                                                const nocontract_t& ij_strides, const contract_t& contract_strides,
+                                                const contract_t& k_strides)
+      : ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) {}
+
+  template
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
+    EIGEN_ALIGN_MAX Scalar data[1];
+    data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
+    return pload(data);
+  }
+  template
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
+    EIGEN_ALIGN_MAX Scalar data[1];
+    data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
+    return pload(data);
+  }
+};
+
+template class MakePointer_ = MakePointer>
+class TensorContractionSubMapper {
+ public:
+  typedef BaseTensorContractionMapper
+      ParentMapper;
+  typedef TensorContractionSubMapper
+      Self;
+  typedef Self LinearMapper;
+  typedef Self SubMapper;
+
+  enum {
+    // We can use direct offsets iff the parent mapper supports them and we can compute the strides.
+    // TODO: we should also enable direct offsets for the Rhs case.
+    UseDirectOffsets =
+        ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size::value > 0)
+  };
+
+  EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
+      : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) {
+    // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute
+    // this offset every time we attempt to access a coefficient.
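+    // After this baking step, the UseDirectOffsets branches of the accessors below index relative to the sub-block
+    // origin: the base mapper's buffer has already been advanced by vert_offset + horiz_offset * stride elements,
+    // so no per-access offset arithmetic is needed.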
+ if (UseDirectOffsets) { + Index stride = m_base_mapper.stride(); + m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride); + } + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper(i, 0); + } + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper(i, j); + } + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, j); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacketPartial(Index i, Index j, Index, Index = 0) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, j); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper.template load(i, j); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const { + if (UseDirectOffsets) { + m_base_mapper.storePacket(i, 0, p); + } + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + if (UseDirectOffsets) { + return LinearMapper(m_base_mapper, i, j); + } + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubMapper getSubMapper(Index i, Index j) const { + if (UseDirectOffsets) { + return SubMapper(m_base_mapper, i, j); + } + return SubMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const Index stride() const { return m_base_mapper.stride(); } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? 
Aligned : Unaligned; + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { + return false; + } + + const ParentMapper& base_mapper() const { return m_base_mapper; } + Index vert_offset() const { return m_vert_offset; } + Index horiz_offset() const { return m_horiz_offset; } + + private: + ParentMapper m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + +template class MakePointer_ = MakePointer> +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + public: + typedef Scalar_ Scalar; + typedef BaseTensorContractionMapper + Base; + typedef TensorContractionSubMapper + SubMapper; + typedef SubMapper VectorMapper; + typedef SubMapper LinearMapper; + + EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader& get_tensor() const { + return Base::m_tensor; + } +}; + +template +struct TensorContractionInputMapperTrait; + +template class MakePointer_> +struct TensorContractionInputMapperTrait< + TensorContractionInputMapper > { + typedef Tensor_ XprType; + static const bool inner_dim_contiguous = inner_dim_contiguous_; + static const bool inner_dim_reordered = inner_dim_reordered_; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h new file mode 100644 index 00000000000..c7203a9306e --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -0,0 +1,1654 @@ +// This file is part of Eigen, a lightweight C++ template library for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not +// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
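Between the two headers, a brief illustration may help: every mapper above ultimately funnels through SimpleTensorContractionMapper::computeIndex, which converts a (row, col) coordinate of the logically flattened matrix into a linear index of the underlying tensor using the nocontract/contract stride arrays. The sketch below is a minimal, self-contained model of that linearization, assuming a fixed rank-3, column-major tensor contracted over its last dimension; the name computeIndexSketch and the two-entry stride arrays are hypothetical simplifications (the real loop handles arbitrary rank and both the Lhs and Rhs sides).

// Sketch: linearize a (row, col) coordinate of the flattened matrix into a
// linear index of a rank-3, column-major tensor of shape [4, 5, 6] that is
// contracted over its last dimension (the Lhs case). The stride arrays play
// the roles of m_nocontract_strides / m_ij_strides above; contract_stride
// stands in for m_contract_strides[0].
#include <array>
#include <cstdio>

long computeIndexSketch(long row, long col, const std::array<long, 2>& nocontract_strides,
                        const std::array<long, 2>& ij_strides, long contract_stride) {
  long linidx = 0;
  long nocontract_val = row;
  // Peel off the outer non-contracting dimension, mirroring the stride loop
  // in SimpleTensorContractionMapper::computeIndex.
  const long idx = nocontract_val / ij_strides[1];
  linidx += idx * nocontract_strides[1];
  nocontract_val -= idx * ij_strides[1];
  // The innermost non-contracting dimension contributes with stride 1 here.
  linidx += nocontract_val * nocontract_strides[0];
  // The contracting coordinate contributes col * stride.
  return linidx + col * contract_stride;
}

int main() {
  // Shape [4, 5, 6], column-major strides {1, 4, 20}; non-contracting dims
  // {0, 1} give tensor strides {1, 4} and logical row strides {1, 4}.
  // row 7 decodes to (i0 = 3, i1 = 1); with col 2: 3*1 + 1*4 + 2*20 = 47.
  std::printf("%ld\n", computeIndexSketch(7, 2, {1, 4}, {1, 4}, 20));
  return 0;
}

In column-major storage the innermost non-contracting stride is 1, which is exactly the inner_dim_contiguous fast path that the mappers assert on (m_nocontract_strides[0] == 1) and that the packet-load code relies on.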
+
+/*****************************************************************
+ * TensorContractionSycl.h
+ *
+ * \brief:
+ *  TensorContractionSycl.h, provides various tensor contraction kernels for the SYCL backend
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace TensorSycl {
+namespace internal {
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+/*!
+ * \brief TVPanelSize, a template class used for setting the panel size required for launching General TensorVector
+ * contraction kernel on various hardware devices.
+ *
+ * \tparam Scalar: determines the element type of the tensor/vector
+ *
+ * \tparam StorageIndex determines the Index type.
+ *
+ * \tparam NCWindow: determines the number of non-contracting elements to be processed by each work-group
+ *
+ * \tparam CFactor: determines the number of contracting elements to be processed by each thread
+ *
+ * \tparam NCFactor: determines the number of non-contracting elements to be processed by each thread
+ */
+template
+struct TVPanelSize {
+  // LocalThreadSizeC: determines the total number of threads per workgroup for the contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+  // LocalThreadSizeNC: determines the total number of threads per workgroup for the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+  // TileSizeDimNC: determines the tile size for the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor;
+  // TileSizeDimC: determines the tile size for the contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC;
+  // WorkLoadPerThreadNC: determines the workload per thread for loading the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC;
+  // WorkLoadPerThreadC: determines the workload per thread for loading the contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC;
+  // BC : determines if supporting bank conflict is required
+  static EIGEN_CONSTEXPR bool BC = false;
+};
+#endif
+
+/*!
+ * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor
+ contraction kernel on various hardware devices.
+ *
+ * \tparam Scalar: determines the element type of the tensor
+ *
+ * \tparam StorageIndex: determines the Index type.
+ *
+ * \tparam REG_SIZE_M: determines the workload per thread for loading the M dimension. This can be varied based on the
+ available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_M macro).
+ *
+ * \tparam REG_SIZE_N: determines the workload per thread for loading the N dimension. This can be varied based on the
+ available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_N macro).
+ *
+ * \tparam TSDK: determines the tile size for dimension K. The packet size is assumed to be considered
+ */
+
+template
+struct TTPanelSize {
+  // TileSizeDimK: determines the tile size for dimension K.
The packet size is assumed to be considered + static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK; + // WorkLoadPerThreadM : determines workload per thread for loading the M dimension This can be varied based on the + // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro// +#ifndef EIGEN_SYCL_REG_M + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M; +#endif +// WorkLoadPerThreadN : determines workload per thread for loading the N dimension This can be varied based on the +// available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro +#ifndef EIGEN_SYCL_REG_N + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N; +#endif + // LocalThreadSizeM: determines total number of thread per workgroup for the m dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0; + // LocalThreadSizeN: determines total number of thread per workgroup for the n dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1; + // TileSizeDimM: determines the tile size for the m dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM; + // TileSizeDimN: determines the tile size for the n dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN; + // LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisible by packetsize + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN)); + // LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. This must be divisible by packetsize + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM)); + // BC : determines if supporting bank conflict is required + static EIGEN_CONSTEXPR bool BC = true; + // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by + // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory) + static EIGEN_CONSTEXPR bool DoubleBuffer = +#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER + false; +#else + true; +#endif +}; + +/* ! + * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to + * specialize the contraction algorithm based on device support for dedicated local memory. + */ +enum class contraction_type { local, no_local }; +/* ! + * \brief data_source an enum class determining the location of the data in a memory hierarchy (global, local, private). + */ +enum class data_source { global_mem, local_mem, private_mem }; + +/*! + * \brief read, a template function used for loading the data from global + memory. This function is used to guarantee coalesced and vectorized load whenever possible + * + * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and + vectorized when possible. Coalesced memory access is a key factor in Kernel performance. 
When a tensor is 2d and the + contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex is the contracting dim index + * + * \param ld: is the leading dimension of the flattened tensor + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) { + const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex; + const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex; + return tensorMapper.get_tensor().template packet(row + (col * ld)); +} + +/*! + * \brief read, special overload of read function, when the read access is not vectorized + * + * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and + vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex: is the contracting dim index + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) { + const StorageIndex row = (IsRhs) ? CIndex : NCIndex; + const StorageIndex col = (IsRhs) ? NCIndex : CIndex; + return tensorMapper(row, col); +} + +/*! + * \brief write, a template function used for storing the data to local memory. This function is used to guarantee + * coalesced and vectorized store whenever possible. + * + * \tparam StorageIndex: determines the Index type + * + * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + * + * \param CIndex is the contracting dim index + */ + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t
write( + PacketType &packet_data, DataScalar ptr) { + EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits::size; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; i++) { + *ptr = PacketWrapper::scalarize(i, packet_data); + ptr += ld; + } +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization enabled This function + * is used to guarantee coalesced and vectorized store whenever possible. + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + */ + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename std::enable_if_t::size != 1 && dt == data_source::global_mem, + void> + write(PacketType &packet_data, DataScalar *ptr) { + ::Eigen::internal::pstoreu(ptr, packet_data); +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled. + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename std::enable_if_t::size == 1 && dt == data_source::global_mem, + void> + write(PacketType &packet_data, DataScalar *ptr) { + *ptr = packet_data; +} + +/*! + * \brief check_boundary: is used to check the edge condition for non-internal blocks. + * + * \tparam is_internal: determines if the block is internal + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) { + return true; +} + +/*! + * \brief check_boundary: specialization of the check_boundary for non-internal blocks. + * + * \param cond: true when the data is in range. Otherwise false + */ +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool cond) { + return cond; +} + +/*! + * \brief BlockProperties is a template class that provides different characteristic of a block of each Tensor processed + * by each workgroup. + * + * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed + * + * \tparam packet_load_: determines if the each element of this tensor block should be loaded in a packet mode + * + * \tparam PacketType: determines the type of packet + * + * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be + * packetType; Otherwise it will be scalar Type + * + * \param elements_per_access determines the size of each element based on OutType + * + * \param is_coalesced_layout determines whether or not the Tensor data in a memory can be access coalesced and + * vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + * contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + * when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. 
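+ *
+ * For example (hypothetical values): with PacketType = float4, elements_per_access is 4, so a coalesced block steps
+ * 4 elements along the non-contracting dimension per access (nc_stride = 4, c_stride = 1), while a non-coalesced
+ * block steps 4 along the contracting dimension instead (nc_stride = 1, c_stride = 4).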
+ *
+ * \param nc_stride determines the stride of the non-contracting dimension to access the next adjacent element within
+ * the Tensor Block for each workgroup
+ *
+ * \param c_stride determines the stride of the contracting dimension to access the next adjacent element within the
+ * Tensor Block for each workgroup
+ */
+template
+struct BlockProperties {
+  static EIGEN_CONSTEXPR bool packet_load = packet_load_;
+  typedef typename Eigen::internal::unpacket_traits::type OutScalar;
+  static EIGEN_CONSTEXPR bool is_rhs = is_rhs_;
+  typedef std::conditional_t OutType;
+  static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits::size;
+  static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs);
+  static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1);
+  static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access);
+};
+
+/*!
+ * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup. Please see
+ * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the definition of
+ * workgroups and work-items
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \param linearLocalThreadId: determines the linearized location of a thread within a work-group
+ *
+ * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. It will be > 1 when
+ * tall/skinny algorithm is used
+ *
+ * \param mGroupOffset: determines the logical start position of all threads within a workgroup for the m dimension of
+ * the flattened tensor.
+ *
+ * \param nGroupOffset: determines the logical start position of all threads within a workgroup for the n dimension of
+ * the flattened tensor.
+ *
+ * \param kGroupOffset determines the logical start position of all threads within a workgroup for the k dimension of
+ * the flattened tensor. It will be > 1 when tall/skinny algorithm is used.
+ *
+ * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of
+ * a flattened tensor. The position determines the distance of each thread within the workgroup from each other
+ * independent from their global position.
+ *
+ * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of
+ * a flattened tensor. The position determines the distance of each thread within the workgroup from each other
+ * independent from their global position.
+ *
+ * \param mGlobalOffset: determines the logical start position of each thread for the m dimension on a
+ * flattened tensor
+ *
+ * \param nGlobalOffset: determines the logical start position of each thread for the n dimension on a
+ * flattened tensor
+ *
+ * \param kSize: determines the number of the k elements of the flattened Tensor to be processed by each thread for
+ * the given tensor block. This differs from the K dimension of the flattened tensor when the tall/skinny algorithm
+ * is used.
+ *
+ * \param is_internal: determines whether the thread within the work-group computes an internal block of the tensor
+ * or an edge block. When it is internal, there is no need to check the boundaries and all the if statements can be
+ * resolved by the compiler.
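+ *
+ * A worked example with hypothetical sizes: for Properties::TileSizeDimM = 64 and TileSizeDimN = 32, the work-group
+ * with mGroupId = 2 and nGroupId = 1 covers mGroupOffset = 128 and nGroupOffset = 32; a thread whose mLocalOffset is
+ * 8 and nLocalOffset is 4 then starts at mGlobalOffset = 136 and nGlobalOffset = 36.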
+ */
+template
+struct ThreadProperties {
+  const StorageIndex linearLocalThreadId;
+  const StorageIndex kGroupId;
+  const StorageIndex mGroupOffset;
+  const StorageIndex nGroupOffset;
+  const StorageIndex kGroupOffset;
+  const StorageIndex mLocalOffset;
+  const StorageIndex nLocalOffset;
+  const StorageIndex mGlobalOffset;
+  const StorageIndex nGlobalOffset;
+  StorageIndex kSize;
+  const bool is_internal;
+  // this is used to adjust the last block
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties(
+      const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_,
+      const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_,
+      const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_,
+      StorageIndex kSize_, const bool is_internal_)
+      : linearLocalThreadId(linearLocalThreadId_),
+        kGroupId(kGroupId_),
+        mGroupOffset(mGroupOffset_),
+        nGroupOffset(nGroupOffset_),
+        kGroupOffset(kGroupOffset_),
+        mLocalOffset(mLocalOffset_),
+        nLocalOffset(nLocalOffset_),
+        mGlobalOffset(mGlobalOffset_),
+        nGlobalOffset(nGlobalOffset_),
+        kSize(kSize_),
+        is_internal(is_internal_) {}
+};
+
+/*!
+ * \brief TensorContractionKernel is a template class that provides the Tensor-Tensor contraction operation.
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam LhsScalar: determines the left-hand-side scalar type
+ *
+ * \tparam RhsScalar: determines the right-hand-side scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for output (please see the sycl-1.2.1 specification
+ (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam LhsMapper determines the tensor contraction mapper type for the left-hand-side matrix
+ *
+ * \tparam RhsMapper determines the tensor contraction mapper type for the right-hand-side matrix
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Properties: determines the Contraction Panel properties
+ *
+ * \tparam TripleDim: determines the M, K, N dimensions for the flattened tensors in order to treat them as matrices
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \tparam input_mapper_properties: determines if the input tensors are matrices. If they are, special memory
+ access is used to guarantee that the memory accesses are always coalesced.
+ *
+ * \tparam IsFinal: determines if this is the final kernel. If so, the result will be written in a final output.
+ Otherwise, the result of the contraction will be written in a temporary buffer. This is the case when Tall/Skinny
+ contraction is used. So in this case, a final reduction step is required to compute the final output.
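+ (In the Tall/Skinny case, work-groups that share a kGroupId therefore contract only their own numTiles *
+ TileSizeDimK slice of K into a dedicated M * N partial-result buffer, and the per-slice buffers are combined by the
+ final reduction step mentioned above.)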
+ + * \tparam contraction_tp: it is an enum value representing whether the local memory/no local memory implementation of + the algorithm to be used + * + * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param groupSizeM: a logical number determining the number of work-group for m dimension + * + * \param groupSizeN: a logical number determining the number of work-group for n dimension + * + * \param numTiles: determines total number of tiles on the k dimension + * + * \param TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix + */ +template +class TensorContractionKernel { + public: + typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise::PacketSize; + static EIGEN_CONSTEXPR bool is_lhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; + + typedef BlockProperties + LHSBlockProperties; + + typedef BlockProperties + RHSBlockProperties; + + static EIGEN_CONSTEXPR StorageIndex NStride = + contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride; + + typedef cl::sycl::accessor Scratch; + typedef cl::sycl::multi_ptr local_ptr; + typedef OutScalar * /*cl::sycl::multi_ptr*/ private_ptr; + typedef std::conditional_t tile_ptr; + static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local + ? Properties::TileSizeDimM + Properties::BC + : Properties::WorkLoadPerThreadM; + static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local + ? Properties::TileSizeDimN + Properties::BC + : Properties::WorkLoadPerThreadN; + static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + + /** + * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not + * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to + * the kernel as an accessor, the private memory can only allowed to be allocated statically. Since we are abstracting + * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out + * different type of memory needed when local/no_local memory computation is called. + * + * \tparam contraction_type: it is an enum value representing whether the local memory/no local memory implementation + of the algorithm to be used + * \tparam the private memory size + * \param ptr the tile memory pointer type + */ + template + struct MemHolder { + tile_ptr ptr; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {} + }; + /** + * \brief specialization of memHolder class when no local memory kernel is used. 
+ */ + template + struct MemHolder { + OutScalar ptr[MemSize] = {OutScalar{0}}; + }; + /** + * \brief TiledMemory: contains required memory pointer for loading each tile of the TensorContraction panel from + * global memory to local/private memory when local/no_local algorithm used. + * + * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the + * selected contraction_type. + * + * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the + * selected contraction_type. + * + * \param lhs_extract_index: determines the position of each thread on a local memory for lhs input. When private + * memory is used this is set to zero as this is not applicable in case of private memory. + * + * \param rhs_extract_index: determines the position of each thread on a local memory for rhs input. When private + * memory is used this is set to zero as this is not applicable in case of private memory. + * + * \param lhs_scratch_compute : determines the location to load for computation for lhs_local memory. This is the + * same as lhs_scratch_extract for private memory. + * + * \param rhs_scratch_compute : determines the location to load for computation for rhs_local memory. This is the + * same as rhs_scratch_extract for private memory. + */ + struct TiledMemory { + MemHolder lhs_scratch_extract; + MemHolder rhs_scratch_extract; + tile_ptr lhs_scratch_ptr_compute; + tile_ptr rhs_scratch_ptr_compute; + const std::pair lhs_extract_index; + const std::pair rhs_extract_index; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TiledMemory(const ThreadProperties &, local_ptr, + std::enable_if_t * = 0) + : lhs_scratch_extract{}, + rhs_scratch_extract{}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr), + lhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})), + rhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TiledMemory(const ThreadProperties &thread_properties, + local_ptr block_start_ptr, + std::enable_if_t * = 0) + : lhs_scratch_extract{block_start_ptr}, + rhs_scratch_extract{lhs_scratch_extract.ptr + + ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset), + lhs_extract_index( + local_id_extract(thread_properties.linearLocalThreadId)), + rhs_extract_index( + local_id_extract(thread_properties.linearLocalThreadId)) {} + }; + + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex groupSizeM; + const StorageIndex groupSizeN; + const StorageIndex numTiles; + const TripleDim triple_dim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex groupSizeM_, + const StorageIndex groupSizeN_, + const StorageIndex numTiles_, + const TripleDim triple_dim_) + : scratch(scratch_), + lhs(lhs_), + rhs(rhs_), + out_res(out_res_), + groupSizeM(groupSizeM_), + groupSizeN(groupSizeN_), + numTiles(numTiles_), + triple_dim(triple_dim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex groupSizeM_, + const StorageIndex 
numTiles_,
+                                                                 const TripleDim triple_dim_)
+       : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
+
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
+     const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
+     const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
+     const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
+     const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM;
+     const StorageIndex tmp = itemID.get_group(0) / groupSizeM;
+     const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN;
+     const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN;
+     const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM;
+     const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN;
+     const StorageIndex mLocalOffset = PacketSize * mLocalThreadId;
+     const StorageIndex nLocalOffset = NStride * nLocalThreadId;
+     const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset;
+     const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset;
+
+     const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK;
+     StorageIndex kGroupOffset = kGroupId * kSizePerWG;
+     const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM &&
+                              triple_dim.N - nGroupOffset >= Properties::TileSizeDimN &&
+                              triple_dim.K - kGroupOffset >= kSizePerWG;
+     // this is used to adjust the last block
+     StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset);
+     // This is used to find the last K offset, so that kGroupOffset - kSize gives the k offset for loading the tile.
+     kGroupOffset += kSize;
+
+     auto thread_properties =
+         ThreadProperties(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
+                          mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
+
+     auto out_ptr = out_res + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
+
+     (thread_properties.is_internal) ? compute_panel(itemID, thread_properties, out_ptr)
+                                     : compute_panel(itemID, thread_properties, out_ptr);
+   }
+   // The compute block computes the private contraction block for each thread and stores the result in the
+   // privateRes memory of each computation. The compute block function is independent of the local and no-local
+   // concepts, as it only computes the block in each thread's private memory space.
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
+                                                                     PacketReturnType *privateRes) const {
+     StorageIndex idx = 0;
+     EIGEN_CONSTEXPR StorageIndex lhs_stride =
+         contraction_tp == contraction_type::local ?
(PacketSize * Properties::LocalThreadSizeM) : 1; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) { + auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)}; + StorageIndex lhs_index = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType lhsPack{}; + Eigen::TensorSycl::internal::PacketWrapper::set_packet(lhsPack, + lhs_block_ptr + lhs_index); + privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]); + + lhs_index += lhs_stride; + idx++; + } + } + } + // The store function write the computed contraction operation in the private memory of each thread to the global + // memory. The store function is independent of local and no local concepts s that it can be abstract out in the base + // class. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes, + StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) const { + auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC { + return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N); + }; + // when local memory is not used M and N are both accessed in a coalesced way. However, when local memory is + // available the k*N is transposed in the local to N*K therefore, each blocks operates on blockId* + // WorkLoadPerThreadN slice of N + EIGEN_CONSTEXPR StorageIndex GlobalNStride = + contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) { + // output leading dimension + StorageIndex outputLD = 0; + // When local memory is used the PrivateNstride is always 1 because the coalesced access on N is loaded into Local + // memory and extracting from local to global is the same as no transposed version. However, when local memory is + // not used and RHS is transposed we packetize the load for RHS. + EIGEN_UNROLL_LOOP + for (StorageIndex nId = 0; nId < PrivateNStride; nId++) { + StorageIndex globalRow = mGlobalOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType privetOut = privateRes[wLPTM]; + if (check_boundary(chk_bound(globalRow, nId))) { + // Store the final results in C. 
The C matrix has always M as a first StorageIndex and N as a second + // StorageIndex Therefore it is always coalesced layout + write(privetOut, out_ptr + outputLD + globalRow); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex mId = 0; mId < PacketSize; mId++) { + StorageIndex mOffset = globalRow + mId; + if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) { + out_ptr[mOffset + outputLD] = + Eigen::TensorSycl::internal::PacketWrapper::scalarize(mId, privetOut); + } + } + } + globalRow += (PacketSize * Properties::LocalThreadSizeM); + } + outputLD += triple_dim.M; + privateRes += Properties::WorkLoadPerThreadM / PacketSize; + } + out_ptr += (GlobalNStride * outputLD); + + nGlobalOffset += (PrivateNStride * GlobalNStride); + } + } + // when no local memory is used the following extract_block will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t extract_block( + const Input &inpt, PrivateReg private_ptr, const std::pair &, + const StorageIndex &ncOffset, const StorageIndex cOffset) const { + EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = + InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM; + EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = + InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM; + const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; + + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; + StorageIndex cIndex = cOffset; + + EIGEN_UNROLL_LOOP + for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) { + StorageIndex ncIndex = ncOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) { + if (check_boundary(chk_bound(cIndex, ncIndex))) { + auto val = + read(inpt, ncIndex, cIndex, ld); + + write(val, private_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (ncInd < NC && cInd < triple_dim.K) + ? read( + inpt, ncInd, cInd, ld) + : OutScalar(0); + write( + val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC)); + } + } + + // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So + // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread. + ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1) + ? 
ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC
+                       : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC);
+         private_ptr += InputBlockProperties::nc_stride;
+       }
+       // the previous loop (private_ptr += nc_stride per iteration) has already advanced the pointer by one
+       // WorkLoadPerThreadNC
+       private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC;
+       cIndex += InputBlockProperties::c_stride;
+     }
+   }
+   template
+   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair local_id_extract(
+       const StorageIndex &linearLocalThreadId) {
+     const StorageIndex localThreadNC =
+         (InputBlockProperties::is_coalesced_layout)
+             ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride)
+             : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+     const StorageIndex localThreadC =
+         (InputBlockProperties::is_coalesced_layout)
+             ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride)
+             : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+     return std::pair(localThreadNC, localThreadC);
+   }
+
+   template
+   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t sync_mem(
+       const cl::sycl::nd_item<1> &, bool &db_offset) noexcept {
+     db_offset = !db_offset;
+   }
+
+   template
+   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t sync_mem(
+       const cl::sycl::nd_item<1> &itemID, bool &) noexcept {
+     itemID.barrier(cl::sycl::access::fence_space::local_space);
+   }
+
+   template
+   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t sync_mem(
+       const cl::sycl::nd_item<1> &, bool &) noexcept {
+     return;
+   }
+
+   template
+   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t
+   sync_thread(const cl::sycl::nd_item<1> &
+#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
+               itemID
+#endif
+               ) noexcept {
+#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
+     itemID.barrier(cl::sycl::access::fence_space::local_space);
+#else
+     return;
+#endif
+   }
+   template
+   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t
+   sync_thread(const cl::sycl::nd_item<1> &itemID) {
+     itemID.barrier(cl::sycl::access::fence_space::local_space);
+   }
+   template
+   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t sync_thread(const cl::sycl::nd_item<1> &) {
+     return;
+   }
+
+   template
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
+                                                                     ThreadProperties &thread_properties,
+                                                                     TiledMemory &tiled_input_block,
+                                                                     PacketReturnType *privateRes,
+                                                                     bool &db_offset) const {
+     // Tiling the Rhs block from global to local memory
+     extract_block(
+         rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
+         tiled_input_block.rhs_extract_index,
+         contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset,
+         thread_properties.kGroupOffset - thread_properties.kSize);
+
+     sync_thread(itemID);
+
+     // Tiling the Lhs block from global to local memory
+     extract_block(
+         lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK),
+         tiled_input_block.lhs_extract_index,
+         contraction_tp == contraction_type::local ?
thread_properties.mGroupOffset : thread_properties.mGlobalOffset, + thread_properties.kGroupOffset - thread_properties.kSize); + + // itemID.barrier(cl::sycl::access::fence_space::local_space); + sync_thread(itemID); + // switch to compute mede + StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK); + StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR); + // Loop over the values of a single tile + for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) { + compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset, + tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes); + lhs_offset += LSDL; + rhs_offset += LSDR; + } + // computing the K index for the next tile + thread_properties.kSize -= Properties::TileSizeDimK; + sync_mem(itemID, db_offset); + } + + // when local memory is available the following compute_panel will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID, + ThreadProperties &thread_properties, + OutPtr out_ptr) const { + auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()}; + // Allocate register space + PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = { + PacketReturnType{0}}; + bool db_offset = 0; + + while (thread_properties.kSize >= Properties::TileSizeDimK) { + compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + if (thread_properties.kSize > 0) { + compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + + // Storing the final results in the output + store(1) : RHSBlockProperties::nc_stride>( + out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset, + thread_properties.nGlobalOffset); + } + // When local memory is available the following extract_block will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t extract_block( + const Input &inpt, Local local_ptr, const std::pair &local_index, + const StorageIndex &ncOffset, const StorageIndex cOffset) const { + EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = + InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM; + EIGEN_CONSTEXPR StorageIndex LoadPerThread = + InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs; + EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL; + static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) && + (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)), + " LocalOffset must be divisible by stride"); + const StorageIndex &NC = InputBlockProperties::is_rhs ? 
triple_dim.N : triple_dim.M; + StorageIndex localThreadNC = local_index.first; + StorageIndex localThreadC = local_index.second; + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + EIGEN_UNROLL_LOOP + for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) { + const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC); + const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC); + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; + if (check_boundary(chk_bound(CIndex, NCIndex))) { + auto val = + read(inpt, NCIndex, CIndex, ld); + write( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::c_stride * localThreadC * LSD)); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (nCInd < NC && cInd < triple_dim.K) + ? read( + inpt, nCInd, cInd, ld) + : OutScalar(0); + + write( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::c_stride * localThreadC + + (InputBlockProperties::is_coalesced_layout ? 0 : i)) * + LSD)); + } + } + localThreadNC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride); + localThreadC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride); + } + } +}; + +#ifndef EIGEN_SYCL_DISABLE_GEMV + +/*! + * \brief GeneralVectorTensor is a template class that provides Tensor -vector contraction operation, which is a special + * case of Tensor Tensor contraction. + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification + * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs) + * + * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs) + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Properties: determines the Contraction Panel properties + * + * \tparam KFactor: determines the number of elements in K dimension in a Tile + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector + * + * \tparam IsFinal: determine if this is the final kernel. If so, the result will be written in a final output. + * Otherwise, the result of contraction will be written iin a temporary buffer. 
+ * + * \param scratch: determines the local memory containing the vector block for each work-group + * + * \param vec: determines the vector input (tensor mapper) + * + * \param mat: determines the tensor input (tensor mapper) + * + * \param out_res: determines the output vector containing the contraction result + * + * \param nonContractGroupSize: a logical number determining the number of work-group for non-contracting dimension + * + * \param nonContractDim: determines the size of non contracting dimension for the flattened tensor + * + * \param contractDim: determines the size of non contracting dimension for the flattened tensor + * + */ +template +struct GeneralVectorTensor { + typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise::PacketSize; + typedef cl::sycl::accessor Scratch; + + static EIGEN_CONSTEXPR StorageIndex OutScratchOffset = + KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + + // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make + // sure that the !^ is true When RHS is a vector, we pass true and true to make sure that the !^ is true. + typedef BlockProperties + VecBlockProperties; + + Scratch scratch; + const VectorMapper vec; + const TensorMapper mat; + OutAccessor out_res; + const StorageIndex nonContractGroupSize; + const StorageIndex nonContractDim; + const StorageIndex contractDim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_, + const TensorMapper mat_, OutAccessor out_res_, + const StorageIndex nonContractGroupSize_, + const StorageIndex nonContractDim_, + const StorageIndex contractDim_) + : scratch(scratch_), + vec(vec_), + mat(mat_), + out_res(out_res_), + nonContractGroupSize(nonContractGroupSize_), + nonContractDim(nonContractDim_), + contractDim(contractDim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { + auto scratch_ptr = scratch.get_pointer(); + const StorageIndex linearLocalThreadId = itemID.get_local_id(0); + StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC + : linearLocalThreadId % Properties::LocalThreadSizeNC; + StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC + : linearLocalThreadId / Properties::LocalThreadSizeNC; + const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize; + const StorageIndex nonContractGroupId = + is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize; + const StorageIndex contractGroupId = + is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize; + auto out_ptr = out_res + (IsFinal ? 
0 : contractGroupId * nonContractDim); + + const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC; + const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC; + auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId; + const StorageIndex globalContractDimOffset = contractGroupOffset + contractId; + auto local_output = scratch_ptr + OutScratchOffset; + const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC && + contractDim - contractGroupOffset >= Properties::TileSizeDimC; + is_internal + ? compute_panel(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex) + : compute_panel(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel( + const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output, + OutPtr out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + OutScalar *scratch_ptr, const StorageIndex contractGroupOffset, +#endif + const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim, + StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId, + StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) { + OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)}; + // Reading the vector +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId; + extract_block(vec, scratch_ptr, linearLocalThreadId, + vectorOffset, contractDim); + + itemID.barrier(cl::sycl::access::fence_space::local_space); + auto in_scratch_ptr = scratch_ptr + contractId; +#endif + + StorageIndex privateOffsetC = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) { + StorageIndex privateOffsetNC = 0; + bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim); +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + auto vecScalar = *in_scratch_ptr; +#else + auto vecScalar = (check_boundary(contract_conds)) + ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC, + is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0)) + : OutScalar(0); +#endif + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + auto matScalar = (check_boundary( + contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim))) + ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC + : globalNonContractDimOffset + privateOffsetNC, + is_lhs_vec ? 
globalNonContractDimOffset + privateOffsetNC + : globalContractDimOffset + privateOffsetC) + : OutScalar(0); + + outScalar[j] = ::Eigen::internal::pmadd(matScalar, vecScalar, outScalar[j]); + privateOffsetNC += Properties::LocalThreadSizeNC; + } + privateOffsetC += Properties::LocalThreadSizeC; +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + in_scratch_ptr += Properties::LocalThreadSizeC; +#endif + } + + auto out_scratch_ptr = local_output + outScratchIndex; + // Each block of 16*16 element in shared memory should reduce to 16*1 + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + *out_scratch_ptr = outScalar[j]; + + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + } + if (is_lhs_vec) { + nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC; + contractId = linearLocalThreadId / Properties::LocalThreadSizeNC; + outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + } + + out_scratch_ptr = local_output + outScratchIndex; + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + EIGEN_UNROLL_LOOP + for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (contractId < offset) { + StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset); + *out_scratch_ptr += out_scratch_ptr[myNeigbourId]; + } + } + // moving to next 16 by 16 block + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + } + + if (contractId == 0) { + out_scratch_ptr = local_output + nonContractId; + StorageIndex global_final_offset = nonContractGroupOffset + nonContractId; + out_ptr += global_final_offset; + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + if (check_boundary(global_final_offset < nonContractDim)) { + auto res = *out_scratch_ptr; + + *out_ptr = res; + out_ptr += Properties::LocalThreadSizeNC; + } + // moving to next 16 by 16 block to ge the next 16 reduced elements + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC; + } + } + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr, + const StorageIndex &linearLocalThreadId, + const StorageIndex &cOffset, const StorageIndex &C) { + local_ptr += InputBlockProperties::c_stride * linearLocalThreadId; + StorageIndex cIndex = cOffset; + for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) { + if (check_boundary(cIndex + InputBlockProperties::c_stride - 1 < C)) { + auto val = read(inpt, StorageIndex(0), + cIndex, StorageIndex(1)); + write(val, local_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + OutScalar val = + (cIndex + i < C) + ? read( + inpt, StorageIndex(0), cIndex + i, StorageIndex(1)) + : OutScalar(0); + write(val, local_ptr + i); + } + } + local_ptr += InputBlockProperties::c_stride * GroupSize; + cIndex += InputBlockProperties::c_stride * GroupSize; + } + } +}; +#endif + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + +/*! + * \brief GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contraction + * operation, when all the dimensions are contracting dimensions. 
This Kernel reduces two tensors to an scalar + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam LhsScalar: determines the left-hand-side scalar type + * + * \tparam RhsScalar: determines the right-hand-side scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification + * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix + * + * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param rng: determines the total input data size + */ +template +struct GeneralScalarContraction { + typedef cl::sycl::accessor Scratch; + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex rng; + + EIGEN_DEVICE_FUNC GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, + OutAccessor out_res_, const StorageIndex rng_) + : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {} + + EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const { + auto out_ptr = out_res; + OutScalar *scratch_ptr = scratch.get_pointer(); + + StorageIndex globalid = itemID.get_global_id(0); + StorageIndex localid = itemID.get_local_id(0); + OutScalar accumulator = OutScalar(0); + for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) { + accumulator = Eigen::internal::pmadd(lhs(0, i), rhs(i, 0), accumulator); + } + auto out_scratch_ptr = scratch_ptr + localid; + *out_scratch_ptr = accumulator; + for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]); + } + } + if (localid == 0) { + out_ptr[itemID.get_group(0)] = accumulator; + } + } +}; +#endif + +} // namespace internal +} // namespace TensorSycl + +template +struct TensorEvaluator, + Eigen::SyclDevice> + : public TensorContractionEvaluatorBase, Eigen::SyclDevice>> { + static_assert(std::is_same::value, + "SYCL tensor contraction does not support output kernels."); + + typedef Eigen::SyclDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + typedef TensorContractionOp XprType; + typedef std::remove_const_t Scalar; + typedef typename XprType::Index StorageIndex; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Base::Storage Storage; + typedef typename Base::EvaluatorPointerType EvaluatorPointerType; + struct TripleDim { + const StorageIndex M; + const StorageIndex N; + const StorageIndex K; + TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {} + }; + enum { + PacketAccess = (PacketType::size > 1), + 
BlockAccess = false, + }; + + static constexpr int Layout = TensorEvaluator::Layout; + static constexpr int LDims = Base::LDims; + static constexpr int RDims = Base::RDims; + static constexpr int ContractDims = Base::ContractDims; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static constexpr int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + typedef std::remove_const_t LhsScalar; + typedef std::remove_const_t RhsScalar; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + template + struct input_mapper_propertis { + static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_matrix = + (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered); + }; + + TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {} + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (!data) { + this->m_result = this->m_device.get( + static_cast(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar)))); + data = this->m_result; + } + evalToSycl(data); + return (this->m_result != NULL); + } + const Eigen::SyclDevice &device() const { return this->m_device; } + void evalToSycl(typename Base::EvaluatorPointerType buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } + } + + template + void evalTyped(typename Base::EvaluatorPointerType buffer) const { + const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size}; + typedef internal::TensorContractionInputMapper< + LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t, + PacketType::size, lhs_inner_dim_contiguous, false, Unaligned, MakePointer> + LhsMapper; + + typedef internal::TensorContractionInputMapper::size, rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned, MakePointer> + RhsMapper; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + if (triple_dim.M == 1 && triple_dim.N == 1) { + launchSC(buffer, lhs, rhs, triple_dim.K); + } else +#endif +#ifndef EIGEN_SYCL_DISABLE_GEMV + if (triple_dim.M != 1 && triple_dim.N == 1) { + LaunchVT(buffer, rhs, lhs, triple_dim.M, triple_dim.K); + } else if (triple_dim.M == 1 && 
triple_dim.N != 1) { + LaunchVT(buffer, lhs, rhs, triple_dim.N, triple_dim.K); + } else // This is equivalent of if (m!=1 && n!=1) +#endif + { + typedef input_mapper_propertis + inpt_mapper_properties; +#ifndef EIGEN_SYCL_DISABLE_SKINNY + bool skinny = false; + auto platform_name = this->device().getPlatformName(); + // This is based on empirical calculation for AMD r9-nano and Fiji + if (platform_name.find("AMD") == 0) { + skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) && + ((triple_dim.M < 1024 && triple_dim.N < 1024) || + (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K))); + } else { + skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) || + ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) || + ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100)); + } + if (skinny) + adjustTT(buffer, lhs, rhs, triple_dim); + else +#endif // EIGEN_SYCL_DISABLE_SKINNY + adjustTT(buffer, lhs, rhs, triple_dim); + } + } + + template + void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + if (device().has_local_memory()) { + typedef TensorSycl::internal::TTPanelSize PanelParameters; + launchTT( + buffer, lhs, rhs, triple_dim); + } +#endif +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF + if (!(device().has_local_memory())) { + typedef TensorSycl::internal::TTPanelSize PanelParameters; + launchTT( + buffer, lhs, rhs, triple_dim); + } +#endif + } + + template + void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { + const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM); + const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN); + const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM; + const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN; + + const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK); + StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK; + StorageIndex groupSizeK = + skinny + ? std::max(std::min(totalTilesK, + (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) / + (groupSizeM * groupSizeN)), + StorageIndex(1)) + : StorageIndex(1); + + const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK; + + const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK; + + const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + const StorageIndex globalRange = totalGroupSize * localRange; + + const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local) + ? 
((Properties::DoubleBuffer + 1) * + (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) + + ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) * + (Properties::TileSizeDimN + Properties::BC)) + : StorageIndex(1); + + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (groupSizeK == 1) { + typedef TensorSycl::internal::TensorContractionKernel + ContractKernelName; + device() + .template binary_kernel_launcher( + lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim) + .wait(); + } else { + typedef TensorSycl::internal::TensorContractionKernel + ContractKernelName; + CoeffReturnType *temp_pointer = static_cast( + device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + + device() + .template binary_kernel_launcher( + lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, + triple_dim) + .wait(); + + typedef Eigen::internal::SumReducer Op; + auto op = Op(); + typedef TensorSycl::internal::SecondStepPartialReduction + ReductionKernel; + + device() + .template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex( + Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))), + cl::sycl::range<1>(localRange)), + StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK) + .wait(); + device().deallocate_temp(temp_pointer); + } + } + +#ifndef EIGEN_SYCL_DISABLE_GEMV + template + void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat, + StorageIndex NC, StorageIndex C) const { + const StorageIndex nonContractDim = NC; + EIGEN_CONSTEXPR StorageIndex NCFactor = 1; + EIGEN_CONSTEXPR StorageIndex CFactor = 1; + EIGEN_CONSTEXPR StorageIndex NCWindow = 16; + typedef Eigen::TensorSycl::internal::TVPanelSize + Properties; + const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC); + const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC); + const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC); + const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC); + const StorageIndex globalRange = + (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC)); + const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC; + const StorageIndex scratchSize = + (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (cNumGroups > 1) { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor + ContractKernelName; + CoeffReturnType *temp_pointer = + static_cast(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + + device() + .template binary_kernel_launcher( + vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C) + .wait(); + + typedef Eigen::internal::SumReducer Op; + typedef 
TensorSycl::internal::SecondStepPartialReduction + ReductionKernel; + + device() + .template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>( + cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)), + cl::sycl::range<1>(localRange)), + StorageIndex(1), Op(), nonContractDim, cNumGroups) + .wait(); + device().deallocate_temp(temp_pointer); + } else { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor + ContractKernelName; + device() + .template binary_kernel_launcher( + vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C) + .wait(); + } + } +#endif + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + template + EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + StorageIndex K) const { + EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & + (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; + + // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread + // reduces at least 512 elementss individually, we get better performance. + const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1); + const StorageIndex global_range = num_work_group * local_range; + + typedef Eigen::TensorSycl::internal::GeneralScalarContraction< + CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false> + ContractKernelName; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + if (num_work_group > 1) { + CoeffReturnType *temp_pointer = + static_cast(device().allocate_temp(num_work_group * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + device() + .template binary_kernel_launcher(lhs, rhs, tmp_global_accessor, + thread_range, local_range, K) + .wait(); + typedef Eigen::internal::SumReducer Op; + typedef TensorSycl::internal::SecondStepFullReducer + GenericRKernel; + device() + .template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, + Op()) + .wait(); + device().deallocate_temp(temp_pointer); + } else { + device() + .template binary_kernel_launcher(lhs, rhs, buffer, thread_range, + local_range, K) + .wait(); + } + } +#endif + + EIGEN_STRONG_INLINE void cleanup() { + this->m_leftImpl.cleanup(); + this->m_rightImpl.cleanup(); + + if (this->m_result) { + this->m_device.deallocate_temp(this->m_result); + this->m_result = NULL; + } + } +}; +} // namespace Eigen +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h new file mode 100644 index 00000000000..288d79f1f2a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -0,0 +1,1552 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H + +// evaluator for thread pool device +#ifdef EIGEN_USE_THREADS + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +template +struct TensorEvaluator, + ThreadPoolDevice> + : public TensorContractionEvaluatorBase, ThreadPoolDevice>> { + typedef ThreadPoolDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef std::remove_const_t Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + static constexpr int Layout = TensorEvaluator::Layout; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef std::conditional_t(Layout) == static_cast(ColMajor), LeftArgType, RightArgType> + EvalLeftArgType; + typedef std::conditional_t(Layout) == static_cast(ColMajor), RightArgType, LeftArgType> + EvalRightArgType; + + static constexpr int LDims = + internal::array_size::Dimensions>::value; + static constexpr int RDims = + internal::array_size::Dimensions>::value; + static constexpr int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static constexpr int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef std::remove_const_t LhsScalar; + typedef std::remove_const_t RhsScalar; + typedef typename internal::gebp_traits Traits; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + template + void evalProduct(Scalar* buffer) const { + evalProductImpl(buffer, NoCallback()); + } + + template + void evalProductAsync(Scalar* buffer, EvalToCallback done) const { + evalProductImpl(buffer, std::move(done)); + } + + template + void evalProductImpl(Scalar* buffer, DoneCallback done) const { + // This function computes a lot of heuristics in multiple steps, and it + // also has multiple exit points. To keep it sane, readable and all in one + // place, sync/async execution decision is made at runtime at the very end. + // + // (1) In sync mode we allocate Context on the stack, submit computations + // to the device thread pool, and block on a barrier until it is + // completed. + // + // (2) In async mode we allocate Context on the heap, and after all tasks + // are finished, we call provided the done callback, and delete a + // context from the heap. + // + // (*) EvalParallelContext & EvalShardedByInnerDimContext owns all the state + // and temporary buffers, required for executing the tensor contraction. + // They are responsible for cleaning it up after contraction is done. 
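+     // As a sketch of what the two modes look like from the caller's side (illustrative only; assumes the
+     // ThreadPool-device API with a done callback and Eigen::Barrier, with arbitrary tensor sizes):
+     //
+     //   Eigen::ThreadPool pool(4);
+     //   Eigen::ThreadPoolDevice device(&pool, 4);
+     //   Eigen::Tensor<float, 2> a(64, 32), b(32, 48), c(64, 48);
+     //   a.setRandom(); b.setRandom();
+     //   const Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
+     //   c.device(device) = a.contract(b, dims);              // (1) sync: blocks until done
+     //   Eigen::Barrier done_barrier(1);
+     //   c.device(device, [&]() { done_barrier.Notify(); })   // (2) async: returns immediately
+     //       = a.contract(b, dims);
+     //   done_barrier.Wait();                                 // rendezvous with the done callback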
+ static const bool IsEvalInSyncMode = std::is_same::value; + + const Index m = this->m_i_size; + const Index n = this->m_j_size; + const Index k = this->m_k_size; + if (m == 0 || n == 0 || k == 0) return; + + // Compute a set of algorithm parameters: + // - kernel block sizes (bm, bn, bk) + // - task grain sizes (number of kernels executed per task: gm, gn) + // - number of threads + // - sharding by row/column + // - parallel packing or first lhs then rhs + // and some derived parameters: + // - number of tasks (nm, nn, nk) + // - number of kernels (nm0, nn0) + // Unfortunately, all these parameters are tightly interdependent. + // So in some cases we first compute approximate values, then compute other + // values based on these approximations and then refine the approximations. + + // There are lots of heuristics here. There is some reasoning behind them, + // but ultimately they are just tuned on contraction benchmarks for + // different input configurations, thread counts and instruction sets. + // So feel free to question any of them. + + // Compute whether we want to shard by row or by column. + // This is a first approximation, it will be refined later. Since we don't + // know number of threads yet we use 2, because what's we are most + // interested in at this point is whether it makes sense to use + // parallelization at all or not. + bool shard_by_col = shardByCol(m, n, 2); + + // First approximation of kernel blocking sizes. + // Again, we don't know number of threads yet, so we use 2. + Index bm, bn, bk; + if (shard_by_col) { + internal::TensorContractionBlocking blocking(k, m, n, + 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking blocking(k, m, n, + 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Compute optimal number of threads. + // Note: we use bk instead of k here because we are interested in amount of + // _parallelizable_ computations, and computations are not parallelizable + // across k dimension. + const TensorOpCost cost = contractionCost(m, n, bm, bn, bk, shard_by_col, false); + int num_threads = + TensorCostModel::numThreads(static_cast(n) * m, cost, this->m_device.numThreads()); + int num_threads_by_k = numThreadsInnerDim(m, n, k); + if (shardByInnerDim(m, n, k, num_threads, num_threads_by_k)) { + // We are in the scenario where it is more effective to shard by the + // inner dimension. + if (IsEvalInSyncMode) { + EvalShardedByInnerDimContext ctx(this, num_threads_by_k, buffer, m, n, k, std::move(done)); + ctx.template run(); + } else { + auto* ctx = + new EvalShardedByInnerDimContext(this, num_threads_by_k, buffer, m, n, k, std::move(done)); + ctx->template runAsync(); + } + + return; + } + + // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost + // model is not tuned. Remove this when the cost model is tuned. + if (n == 1) num_threads = 1; + + if (num_threads == 1) { + TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Unaligned, (buffer)); + if (!IsEvalInSyncMode) done(); + return; + } + + // Now that we know number of threads, recalculate sharding and blocking. 
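+     // As a hedged numeric illustration (the actual block sizes come from TensorContractionBlocking and depend on
+     // scalar type and cache sizes): for m = n = k = 4096 with num_threads = 8 and, say, bm = 128, bn = 128,
+     // bk = 256, the kernel counts computed below are nm0 = div_ceil(4096, 128) = 32, nn0 = 32 and
+     // nk = div_ceil(4096, 256) = 16, which the grain sizes gm/gn then coarsen into larger per-task units.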
+ shard_by_col = shardByCol(m, n, num_threads); + if (shard_by_col) { + internal::TensorContractionBlocking blocking( + k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking blocking( + k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Number of kernels for each dimension. + Index nm0 = numext::div_ceil(m, bm); + Index nn0 = numext::div_ceil(n, bn); + Index nk = numext::div_ceil(k, bk); + + // Calculate task grain size (number of kernels executed per task). + // This task size coarsening serves two purposes: + // 1. It reduces per-task overheads including synchronization overheads. + // 2. It allows to use caches better (reuse the same packed rhs in several + // consecutive kernels). + Index gm = 1; + Index gn = 1; + // If we are sharding by column, then we prefer to reduce rows first. + if (shard_by_col) { + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + } else { + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + } + // Number of tasks in each dimension. + Index nm = numext::div_ceil(nm0, gm); + Index nn = numext::div_ceil(nn0, gn); + + // If there is enough concurrency in the sharding dimension, we choose not + // to paralellize by the other dimension, and execute all kernels in sync + // mode. This reduces parallelism from the nm x nn down to nn + // (shard_by_col==true) or nm (shard_by_col==false). + const Index sharding_dim_tasks = shard_by_col ? nn : nm; + const int num_worker_threads = this->m_device.numThreadsInPool(); + + // With small number of threads we want to make sure that we do not reduce + // parallelism too much. With large number of threads we trade maximum + // parallelism for better memory locality. + const float oversharding_factor = num_worker_threads <= 4 ? 8.0 + : num_worker_threads <= 8 ? 4.0 + : num_worker_threads <= 16 ? 2.0 + : num_worker_threads <= 32 ? 1.0 + : num_worker_threads <= 64 ? 0.8 + : /* num_worker_threads > 64 */ 0.6; + + const bool parallelize_by_sharding_dim_only = sharding_dim_tasks >= oversharding_factor * num_worker_threads; + + // Last by not least, decide whether we want to issue both lhs and rhs + // packing in parallel; or issue lhs packing first, and then issue rhs + // packing when lhs packing completes (for !shard_by_col lhs and rhs are + // swapped). Parallel packing allows more parallelism (for both packing and + // kernels), while sequential packing provides better locality (once + // a thread finishes rhs packing it proceed to kernels with that rhs). + // First, we are interested in parallel packing if there are few tasks. + bool parallel_pack = num_threads >= nm * nn; + // Also do parallel packing if all data fits into L2$. + if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <= l2CacheSize() * num_threads) + parallel_pack = true; + // But don't do it if we will use each rhs only once. Locality seems to be + // more important in this case. + if ((shard_by_col ? nm : nn) == 1) parallel_pack = false; + // Also don't get in the way of parallelize_by_sharding_dim_only + // optimization. + if (parallelize_by_sharding_dim_only) parallel_pack = false; + + // TODO(ezhulnev): With if contexpr we don't need SyncEvalParallelContext. 
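+     // Summary of the decisions made above before dispatching (with illustrative numbers): with 16 worker threads
+     // the oversharding factor is 2.0, so parallelize_by_sharding_dim_only requires at least 32 tasks in the
+     // sharding dimension (nn if shard_by_col, nm otherwise). parallel_pack is turned on when tasks are scarce
+     // (num_threads >= nm * nn) or when one k-slice of packed lhs and rhs fits into the aggregate L2
+     // (m * bk * sizeof(LhsScalar) + n * bk * sizeof(RhsScalar) <= l2CacheSize() * num_threads), and turned back
+     // off when each rhs is used only once or when parallelize_by_sharding_dim_only is already in effect.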
+ if (IsEvalInSyncMode) { +#define CONTEXT_ARGS \ + (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, shard_by_col, parallel_pack, \ + parallelize_by_sharding_dim_only, NoCallback()) \ + .run() + TENSOR_CONTRACTION_DISPATCH(SyncEvalParallelContext, Alignment, CONTEXT_ARGS); +#undef CONTEXT_ARGS + + } else { +#define CONTEXT_ARGS \ + (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, shard_by_col, parallel_pack, \ + parallelize_by_sharding_dim_only, std::move(done)) + TENSOR_CONTRACTION_ASYNC_DISPATCH(EvalParallelContext, DoneCallback, Alignment, CONTEXT_ARGS, run()); +#undef CONTEXT_ARGS + } + } + + // ------------------------------------------------------------------------ // + + // Dummy struct to represent an empty DoneCallback. + + struct NoCallback { + void operator()() { eigen_assert(false && "NoCallback should never be called"); } + }; + + // ------------------------------------------------------------------------ // + + template + class EvalParallelNotification; + + // Synchronous evaluation notification that blocks caller thread in Wait(). + template + class EvalParallelNotification { + public: + EvalParallelNotification(Context*, NoCallback) {} + void Notify() { done_.Notify(); } + void Wait() { done_.Wait(); } + + private: + Eigen::Notification done_; + }; + + // Asynchronous evaluation notification that does not block in Wait(). + template + class EvalParallelNotification { + public: + EvalParallelNotification(Context* ctx, DoneCallback done) : ctx_(ctx), done_(std::move(done)) {} + + void Notify() { + // Make a copy of done callback, because it will be destructed when we + // will delete context in the next line (EvalParallelNotification is a + // data member of EvalParallelContext class). + DoneCallback done_copy = std::move(done_); + + // Delete parallel evaluation context. + delete ctx_; + + // Now safely call the done callback. + done_copy(); + } + + void Wait() {} + + private: + Context* ctx_; + DoneCallback done_; + }; + + // Context orchestrates sync/async parallel contraction evaluation. When it is + // executed in asynchronous mode, it owns all the shared state that might be + // accessible by block packing and kernel tasks. 
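+   // A minimal sketch of the ownership pattern described above (standard C++ only; `MiniAsyncContext` is a
+   // hypothetical name, not part of Eigen): a heap-allocated context must copy the callback out of itself before
+   // `delete this`, exactly as EvalParallelNotification::Notify does above.
+   //
+   //   #include <functional>
+   //   #include <utility>
+   //
+   //   struct MiniAsyncContext {
+   //     std::function<void()> done;
+   //     void finish() {
+   //       std::function<void()> done_copy = std::move(done);  // keep the callback alive
+   //       delete this;              // all context state is destroyed here
+   //       done_copy();              // safe: runs from the local copy only
+   //     }
+   //   };
+   //
+   //   // usage: auto* ctx = new MiniAsyncContext{std::move(user_done)};
+   //   //        submit work that eventually calls ctx->finish();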
+ + template + class EvalParallelContext { + public: + typedef internal::TensorContractionInputMapper::size, + lhs_inner_dim_contiguous, false, Unaligned> + LhsMapper; + typedef internal::TensorContractionInputMapper::size, + rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> + RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + typedef internal::TensorContractionKernel + TensorContractionKernel; + + typedef typename TensorContractionKernel::LhsBlock LhsBlock; + typedef typename TensorContractionKernel::RhsBlock RhsBlock; + typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; + + EvalParallelContext(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, + Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, Index gn, Index nm0, Index nn0, + bool shard_by_col, bool parallel_pack, bool parallelize_by_sharding_dim_only, DoneCallback done) + : created_by_thread_id_(std::this_thread::get_id()), + done_(this, std::move(done)), + device_(self->m_device), + lhs_(self->m_leftImpl, self->m_left_nocontract_strides, self->m_i_strides, self->m_left_contracting_strides, + self->m_k_strides), + rhs_(self->m_rightImpl, self->m_right_nocontract_strides, self->m_j_strides, + self->m_right_contracting_strides, self->m_k_strides), + buffer_(buffer), + output_(buffer, tm), + output_kernel_(self->m_output_kernel), + tensor_contraction_params_(self->m_tensor_contraction_params), + num_threads_(num_threads), + shard_by_col_(shard_by_col), + parallel_pack_(parallel_pack), + parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only), + m_(tm), + n_(tn), + k_(tk), + bm_(bm), + bn_(bn), + bk_(bk), + nm_(nm), + nn_(nn), + nk_(nk), + gm_(gm), + gn_(gn), + nm0_(nm0), + nn0_(nn0), + kernel_(m_, k_, n_, bm_, bk_, bn_), + num_thread_local_allocations_(0), + // We reserve 2X more capacity for a thread local values, than the + // number of threads in the pool to efficiently handle task stealing + // by threads that are not managed by the pool. + thread_local_capacity(2 * (parallelize_by_sharding_dim_only_ ? device_.numThreadsInPool() : 0)), + // We will use only one of the Lhs/Rhs thread local storage depending + // on the shard_by_col value and we parallelize by sharding dim ONLY. + lhs_thread_local_blocks_(shard_by_col_ ? 0 : thread_local_capacity, {*this}, {*this}), + rhs_thread_local_blocks_(shard_by_col_ ? thread_local_capacity : 0, {*this}, {*this}) { + // These two options are mutually exclusive. + eigen_assert(!(parallel_pack && parallelize_by_sharding_dim_only)); + + for (Index x = 0; x < P; x++) { + // Normal number of notifications for k slice switch is + // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only + // nm_ + nn_ notifications, because they will not receive notifications + // from preceding kernels. + state_switch_[x] = + x == 0 ? 1 : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) + (x == P - 1 ? nm_ * nn_ : 0); + state_packing_ready_[x] = parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_); + state_kernel_[x] = new std::atomic*[nm_]; + for (Index m = 0; m < nm_; m++) { + state_kernel_[x][m] = new std::atomic[nn_]; + // Kernels generally receive 3 notifications (previous kernel + 2 + // packing), but the first slice won't get notifications from previous + // kernels. + for (Index n = 0; n < nn_; n++) + state_kernel_[x][m][n].store((x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1), std::memory_order_relaxed); + } + } + + // Allocate memory for packed rhs/lhs matrices. 
+ packed_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/nm0_, // + /*num_rhs=*/nn0_, // + /*num_slices=*/std::min(nk_, P - 1), // + packed_lhs_, packed_rhs_); + + if (parallelize_by_sharding_dim_only_) { + const int num_worker_threads = device_.numThreadsInPool(); + + if (shard_by_col) { + can_use_thread_local_packed_ = new std::atomic[nn_]; + for (int i = 0; i < nn_; ++i) can_use_thread_local_packed_[i].store(true, std::memory_order_relaxed); + + Index num_blocks = num_worker_threads * gn_; + thread_local_pre_alocated_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/0, // + /*num_rhs=*/num_blocks, // + /*num_slices=*/1, // + /*lhs_blocks=*/nullptr, &rhs_thread_local_pre_allocated_); + + } else { + can_use_thread_local_packed_ = new std::atomic[nm_]; + for (int i = 0; i < nm_; ++i) can_use_thread_local_packed_[i].store(true, std::memory_order_relaxed); + + Index num_blocks = num_worker_threads * gm_; + thread_local_pre_alocated_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/num_blocks, // + /*num_rhs=*/0, // + /*num_slices=*/1, &lhs_thread_local_pre_allocated_, // + /*rhs_blocks=*/nullptr); + } + } + } + + ~EvalParallelContext() { + for (Index x = 0; x < P; x++) { + for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m]; + delete[] state_kernel_[x]; + } + kernel_.deallocate(device_, packed_mem_); + if (parallelize_by_sharding_dim_only_) { + kernel_.deallocate(device_, thread_local_pre_alocated_mem_); + delete[] can_use_thread_local_packed_; + } + } + + void run() { + // Kick off packing of the first slice. + signal_switch(0, 1); + + // Wait for overall completion. + // + // If parallel evaluation is executed in async mode, this is a no-op, and + // Wait() will return immediately. In synchronous mode it will block the + // caller thread until it will receive notification from last task. + // + // In async mode, last task when completed will call done callback from + // the same thread, and will delete this context. + // + // TODO(dvyukov): This wait can lead to deadlock if contraction is + // evaluated in synchronous mode. If nthreads contractions are + // concurrently submitted from worker threads, this wait will block all + // worker threads and the system will deadlock. + done_.Wait(); + } + + private: + std::thread::id created_by_thread_id_; + + // This notification is specialized on the type of DoneCallback and can be + // blocking or non-blocking. + EvalParallelNotification done_; + + const Device& device_; + LhsMapper lhs_; + RhsMapper rhs_; + Scalar* const buffer_; + OutputMapper output_; + OutputKernelType output_kernel_; + TensorContractionParams tensor_contraction_params_; + const int num_threads_; + const bool shard_by_col_; + const bool parallel_pack_; + const bool parallelize_by_sharding_dim_only_; + // Matrix sizes. + const Index m_; + const Index n_; + const Index k_; + // Block sizes. + const Index bm_; + const Index bn_; + const Index bk_; + // Number of tasks. + const Index nm_; + const Index nn_; + const Index nk_; + // Task grain sizes (number of kernels executed per task). + const Index gm_; + const Index gn_; + // Number of blocks (this is different from ni_/nn_ because of task size + // coarsening). + const Index nm0_; + const Index nn0_; + // Tensor contraction kernel. + TensorContractionKernel kernel_; + + // Parallelization strategy. + // + // Blocks related to the same k block can run in parallel because they write + // to different output blocks. 
So we parallelize within k slices; this
+  // gives us a parallelism level of m x n. Before we can start any kernels
+  // related to the k-th slice, we need to issue m lhs packing tasks and n rhs
+  // packing tasks.
+  //
+  // However, there is a bottleneck when we are finishing kernels for the k-th
+  // slice (at the very end there is only 1 runnable kernel). To mitigate this
+  // bottleneck we allow kernels from the k-th and k+1-th slices to run in
+  // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same
+  // output block, so they must not run in parallel.
+  //
+  // This gives us the following dependency graph.
+  // On each k slice we have m x n kernel tasks, m lhs packing tasks and n rhs
+  // packing tasks.
+  // Kernel (m, n, k) can start when:
+  //  - kernel (m, n, k-1) has finished
+  //  - lhs packing (m, k) has finished
+  //  - rhs packing (n, k) has finished
+  // Lhs/rhs packing can start when:
+  //  - all k-1 packing has finished (artificially imposed to limit the amount
+  //    of parallel packing)
+  //
+  // On top of that we limit runnable tasks to two consecutive k slices.
+  // This is done to limit the amount of memory we need for packed lhs/rhs
+  // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_).
+  //
+  // state_switch_ tracks when we are ready to switch to the next k slice.
+  // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n).
+  // These variables roll over 3 consecutive k slices: the first two we are
+  // actively executing, plus one to track completion of kernels in the second
+  // slice.
+  static constexpr Index P = 3;
+
+  // Handle to the allocated temporary storage for Lhs/Rhs blocks.
+  BlockMemHandle packed_mem_;
+  std::vector<LhsBlock> packed_lhs_[P - 1];
+  std::vector<RhsBlock> packed_rhs_[P - 1];
+
+  // If we choose to parallelize only by the sharding dimension, each thread
+  // will have its own "thread local" (not C++ thread local storage) memory
+  // for packed_lhs or packed_rhs (depending on whether shard_by_col is false
+  // or true). This memory can't be passed to a kernel that might execute on a
+  // different thread.
+  //
+  // In practice, when we are ready to pack memory for the sharding dimension
+  // (rhs if shard_by_col==true) of the K-th slice, all kernels for the K-1
+  // slice are already computed (99% of the time), and we can pack data into
+  // the thread local storage and guarantee that all the kernels will be
+  // executed immediately in the same thread. This significantly increases the
+  // L1 cache hit ratio and reduces pressure on the memory bus.
+  //
+  // It's still possible that a kernel for the K-th slice will be ready before
+  // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_
+  // and packed_rhs_ to allow kernels to be executed later on a thread
+  // different from the thread that was used for packing.
+
+  // Handle for pre-allocated thread local memory buffers.
+  BlockMemHandle thread_local_pre_alocated_mem_;
+
+  // Only one of these will be initialized depending on the shard_by_col value
+  // (the size will be `num_worker_threads * num_grains_in_the_sharding_dim`).
+  std::vector<LhsBlock> lhs_thread_local_pre_allocated_;
+  std::vector<RhsBlock> rhs_thread_local_pre_allocated_;
+
+  // How many thread local blocks were already allocated.
+  std::atomic<int> num_thread_local_allocations_;
+  const int thread_local_capacity;
+
+  // We will use the pre-allocated Lhs/Rhs blocks defined above if the number
+  // of unique threads in the system is below or equal to the number of threads
+  // in the thread pool. We will fall back on dynamic memory allocation after
+  // that.
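+  // [Editorial sketch] The counting scheme above is a plain dependency-counted
+  // DAG: every task keeps an atomic countdown of unmet dependencies, and the
+  // signal that drops the counter to zero runs (or enqueues) the dependent
+  // task. A minimal standalone illustration of the same pattern follows
+  // (hypothetical names, not part of Eigen; disabled so it does not affect
+  // compilation):
+#if 0
+  struct CountdownTask {
+    std::atomic<int> pending;    // number of unmet dependencies
+    std::function<void()> body;  // work to run when all dependencies are met
+    // Called once per satisfied dependency; whoever makes the counter hit
+    // zero is responsible for running the task.
+    void signal() {
+      if (pending.fetch_sub(1, std::memory_order_acq_rel) == 1) body();
+    }
+  };
+  // kernel(m, n, k) would be a CountdownTask with pending == 3 for k > 0
+  // (previous kernel + lhs packing + rhs packing), matching the
+  // state_kernel_ initialization in the constructor above.
+#endif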
+ + // ThreadLocalBlocks is a container for Lhs or Rhs thread local buffers. Its + // size is equal to the grain size in Lhs/Rhs sharding dimension. + template + class ThreadLocalBlocks { + public: + ThreadLocalBlocks() = default; + + ThreadLocalBlocks(BlockType* base, size_t grain_size) + : is_pre_allocated_(true), thread_local_pre_allocated_base_(base), grain_size_(grain_size) {} + + ThreadLocalBlocks(BlockMemHandle mem_handle, std::vector blocks) + : is_pre_allocated_(false), mem_handle_(std::move(mem_handle)), blocks_(std::move(blocks)) {} + + BlockType& block(int grain_index) { + eigen_assert(grain_index >= 0); + eigen_assert(static_cast(grain_index) < size()); + return is_pre_allocated_ ? thread_local_pre_allocated_base_[grain_index] : blocks_[grain_index]; + } + + void Release(EvalParallelContext& ctx) const { + if (!is_pre_allocated_) { + ctx.kernel_.deallocate(ctx.device_, mem_handle_); + } + } + + size_t size() const { return is_pre_allocated_ ? grain_size_ : blocks_.size(); } + + private: + bool is_pre_allocated_; + + // Reuse pre-allocated thread local buffers. + BlockType* thread_local_pre_allocated_base_ = nullptr; + size_t grain_size_ = 0; + + // These will be initialized only if `is_pre_allocated == false`. + BlockMemHandle mem_handle_{}; + std::vector blocks_; + }; + + // ThreadLocalBlocksInitialize callable does custom thread local blocks + // initialization, and will reuse pre-allocated buffers if possible, or will + // dynamically allocate new memory. + // + // Lhs/Rhs blocks might be of the same type, so we have to pass explicitly + // for what side do we plan to do block allocation. + template + class ThreadLocalBlocksInitialize { + static constexpr bool kIsLhs = !is_rhs && std::is_same::value; + static const bool kIsRhs = is_rhs && std::is_same::value; + static_assert(kIsLhs || kIsRhs, "Unknown block type"); + + using Blocks = ThreadLocalBlocks; + + public: + ThreadLocalBlocksInitialize(EvalParallelContext& ctx) + : ctx_(ctx), num_worker_threads_(ctx_.device_.numThreadsInPool()) {} + + void operator()(Blocks& blocks) { + const int n = ctx_.num_thread_local_allocations_.fetch_add(1, std::memory_order_relaxed); + + if (n >= num_worker_threads_) { + ThreadLocalBlocksAllocator::allocate(ctx_, blocks); + } else { + ThreadLocalBlocksAllocator::reuse(ctx_, n, blocks); + } + } + + private: + // NOTE(ezhulenev): Without 'if constexpr' we have to put calls to + // TensorContractionKernel::allocateSlices into template specializations. + // Also explicit specializations are not allowed at class scope in C++03, + // EvalCtx type parameter is just a workaround for that limitation. 
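+    // [Editorial sketch] With C++17 the two ThreadLocalBlocksAllocator
+    // specializations below could collapse into a single function using
+    // `if constexpr` (illustrative only, mirroring the calls that follow;
+    // disabled so it does not affect compilation):
+#if 0
+    static void allocateBlocks(EvalParallelContext& ctx, Blocks& blocks) {
+      if constexpr (kIsRhs) {
+        std::vector<RhsBlock> rhs_blocks;
+        BlockMemHandle handle = ctx.kernel_.allocateSlices(
+            ctx.device_, /*num_lhs=*/0, /*num_rhs=*/ctx.gn_, /*num_slices=*/1,
+            /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks);
+        blocks = Blocks(std::move(handle), std::move(rhs_blocks));
+      } else {
+        std::vector<LhsBlock> lhs_blocks;
+        BlockMemHandle handle = ctx.kernel_.allocateSlices(
+            ctx.device_, /*num_lhs=*/ctx.gm_, /*num_rhs=*/0, /*num_slices=*/1,
+            /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr);
+        blocks = Blocks(std::move(handle), std::move(lhs_blocks));
+      }
+    }
+#endif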
+ template + struct ThreadLocalBlocksAllocator; + + template + struct ThreadLocalBlocksAllocator { + static void allocate(EvalCtx& ctx, Blocks& blocks) { + std::vector rhs_blocks; + BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(ctx.device_, + /*num_lhs=*/0, + /*num_rhs=*/ctx.gn_, + /*num_slices=*/1, + /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks); + + blocks = ThreadLocalBlocks(std::move(mem_handle), std::move(rhs_blocks)); + } + + static void reuse(EvalCtx& ctx, int index, Blocks& blocks) { + RhsBlock* ptr = &ctx.rhs_thread_local_pre_allocated_[ctx.gn_ * index]; + blocks = ThreadLocalBlocks(ptr, ctx.gn_); + } + }; + + template + struct ThreadLocalBlocksAllocator { + static void allocate(EvalCtx& ctx, Blocks& blocks) { + std::vector lhs_blocks; + BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(ctx.device_, + /*num_lhs=*/ctx.gm_, + /*num_rhs=*/0, + /*num_slices=*/1, + /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr); + + blocks = ThreadLocalBlocks(std::move(mem_handle), std::move(lhs_blocks)); + } + + static void reuse(EvalCtx& ctx, int index, Blocks& blocks) { + LhsBlock* ptr = &ctx.lhs_thread_local_pre_allocated_[ctx.gm_ * index]; + blocks = ThreadLocalBlocks(ptr, ctx.gm_); + } + }; + + EvalParallelContext& ctx_; + const int num_worker_threads_; + }; + + template + class ThreadLocalBlocksRelease { + public: + using Blocks = ThreadLocalBlocks; + ThreadLocalBlocksRelease(EvalParallelContext& ctx) : ctx_(ctx) {} + void operator()(Blocks& blocks) { blocks.Release(ctx_); } + + private: + EvalParallelContext& ctx_; + }; + + // ThreadLocalBlocks initialization callables. + using ThreadLocalLhsInit = ThreadLocalBlocksInitialize; + using ThreadLocalRhsInit = ThreadLocalBlocksInitialize; + + // ThreadLocalBlocks release callables. + using ThreadLocalLhsRelease = ThreadLocalBlocksRelease; + using ThreadLocalRhsRelease = ThreadLocalBlocksRelease; + + // Thread local containers for Lhs/Rhs block packs. In practice only one of + // them will be used, depending on the shard_by_col value. + Eigen::ThreadLocal, ThreadLocalLhsInit, ThreadLocalLhsRelease> lhs_thread_local_blocks_; + Eigen::ThreadLocal, ThreadLocalRhsInit, ThreadLocalRhsRelease> rhs_thread_local_blocks_; + + // After a particular shard for Kth slice missed thread local execution + // opportunity (K-1 slice didn't complete kernels execution), we can no + // longer schedule K+1 and following slices in thread local mode, because + // there is no more guarantee that previous kernels were executed + // sequentially in the same thread (size is nn_ or nm_). + std::atomic* can_use_thread_local_packed_; + + std::atomic** state_kernel_[P]; + // state_switch_ is frequently modified by worker threads, while other + // fields are read-only after constructor. Let's move it to a separate cache + // line to reduce cache-coherency traffic. + char pad_[128]; + std::atomic state_packing_ready_[P]; + std::atomic state_switch_[P]; + + LhsBlock& packed_lhs(Index m, Index k, Index m1, bool use_thread_local) { + if (use_thread_local) { + eigen_assert(!shard_by_col_); + ThreadLocalBlocks& blocks = lhs_thread_local_blocks_.local(); + + Index grain_index = m1 - m * gm_; + return blocks.block( + internal::convert_index(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index? 
+ } else { + return packed_lhs_[k % (P - 1)][m1]; + } + } + + RhsBlock& packed_rhs(Index n, Index k, Index n1, bool use_thread_local) { + if (use_thread_local) { + eigen_assert(shard_by_col_); + ThreadLocalBlocks& blocks = rhs_thread_local_blocks_.local(); + + Index grain_index = n1 - n * gn_; + return blocks.block( + internal::convert_index(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index? + } else { + return packed_rhs_[k % (P - 1)][n1]; + } + } + + // In following two methods (pack_lhs and pack_rhs), if we know for sure + // that we'll be able to immediately call a kernel with packed data, and do + // not submit it to the thread pool, we can use thread local memory for + // packed data. + // + // We can only reliably check it if we are running all kernels in sync mode + // (parallelize only by sharding dim). If kernel for m==0 (n==0) is ready to + // run, it's guaranteed that all kernels with larger values of m (n) are + // also ready, because we execute them in the same order for all K slices. + + void pack_lhs(Index m, Index k) { + bool use_thread_local = false; + + if (parallelize_by_sharding_dim_only_ && !shard_by_col_ && + can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) { + if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) { + use_thread_local = true; + } else { + // If we can't guarantee that all kernels in `k` slice will be + // executed sequentially in current thread, it's no longer safe to use + // thread local memory in following slices along the k dimensions. + eigen_assert(k > 0); + can_use_thread_local_packed_[m].store(false, std::memory_order_relaxed); + } + } + + const Index mend = m * gm_ + gm(m); + for (Index m1 = m * gm_; m1 < mend; m1++) + kernel_.packLhs(&packed_lhs(m, k, m1, use_thread_local), lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); + + if (!parallel_pack_ && shard_by_col_) { + eigen_assert(!use_thread_local); + signal_packing(k); + } else { + signal_switch(k + 1); + for (Index n = nn_ - 1; n >= 0; n--) { + bool sync = parallelize_by_sharding_dim_only_ || n == 0; + signal_kernel(m, n, k, sync, use_thread_local); + } + } + } + + void pack_rhs(Index n, Index k) { + bool use_thread_local = false; + + if (parallelize_by_sharding_dim_only_ && shard_by_col_ && + can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) { + if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) { + use_thread_local = true; + } else { + // If we can't guarantee that all kernels in `k` slice will be + // executed sequentially in current thread, it's no longer safe to use + // thread local memory in following slices along the k dimensions. + eigen_assert(k > 0); + can_use_thread_local_packed_[n].store(false, std::memory_order_relaxed); + } + } + + const Index nend = n * gn_ + gn(n); + for (Index n1 = n * gn_; n1 < nend; n1++) { + if (!TensorContractionKernel::HasBeta && k == 0) { + // Zero the output memory in parallel, only if contraction kernel does + // not support `beta`. Otherwise we will pass beta 0.0 to the first + // call to the `TensorContractionKernel::invoke()`. + // + // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn + // x m) row. Safe to do here because all kernels that will write to + // this memory depend on completion of this task. Note: don't call + // device_.fill() here. device_.fill() blocks on thread pool + // worker thread, which can lead to underutilization and deadlocks. 
+ std::fill_n(buffer_ + n1 * bn_ * m_, bn(n1) * m_, Scalar(0)); + } + kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local), rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); + } + + if (parallel_pack_ || shard_by_col_) { + signal_switch(k + 1); + for (Index m = nm_ - 1; m >= 0; m--) { + bool sync = parallelize_by_sharding_dim_only_ || m == 0; + signal_kernel(m, n, k, sync, use_thread_local); + } + } else { + eigen_assert(!use_thread_local); + signal_packing(k); + } + } + + void kernel(Index m, Index n, Index k, bool use_thread_local) { + // Note: order of iteration matters here. Iteration over m is innermost + // because we want to reuse the same packed rhs in consecutive tasks + // (rhs fits into L2$ while lhs only into L3$). + const Index nend = n * gn_ + gn(n); + const Index mend = m * gm_ + gm(m); + + // NOTE: output = alpha * LHS * RHS + beta * output. + const Scalar alpha = Scalar(1); + const Scalar beta = (TensorContractionKernel::HasBeta && k == 0) ? Scalar(0) : Scalar(1); + + if (shard_by_col_) { + for (Index n1 = n * gn_; n1 < nend; n1++) { + for (Index m1 = m * gm_; m1 < mend; m1++) { + const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); + kernel_.invoke(output_mapper, packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), + packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), bk(k), bn(n1), alpha, beta); + + // We are done with the last task for the [m1, n1] block. + if (k + 1 == nk_) { + output_kernel_(output_mapper, tensor_contraction_params_, m1 * bm_, n1 * bn_, bm(m1), bn(n1)); + } + } + } + } else { + for (Index m1 = m * gm_; m1 < mend; m1++) + for (Index n1 = n * gn_; n1 < nend; n1++) { + const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); + kernel_.invoke(output_mapper, packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), + packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), bk(k), bn(n1), alpha, beta); + + // We are done with the last task for the [m1, n1] block. + if (k + 1 == nk_) { + output_kernel_(output_mapper, tensor_contraction_params_, m1 * bm_, n1 * bn_, bm(m1), bn(n1)); + } + } + } + signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false); + signal_switch(k + 2); + } + + void signal_packing(Index k) { + eigen_assert(!parallel_pack_); + Index s = state_packing_ready_[k % P].fetch_sub(1); + eigen_assert(s > 0); + if (s != 1) return; + state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_; + enqueue_packing(k, shard_by_col_); + } + + void signal_kernel(Index m, Index n, Index k, bool sync, bool use_thread_local) { + std::atomic* state = &state_kernel_[k % P][m][n]; + Index s = state->load(); + eigen_assert(s > 0); + if (s != 1 && state->fetch_sub(1) != 1) { + eigen_assert(!use_thread_local); + return; + } + state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed); + if (sync) { + kernel(m, n, k, use_thread_local); + } else { + eigen_assert(!use_thread_local); + device_.enqueueNoNotification([this, m, n, k, use_thread_local]() { + kernel(m, n, k, use_thread_local); + }); + } + } + + void signal_switch(Index k, Index v = 1) { + Index s = state_switch_[k % P].fetch_sub(v); + eigen_assert(s >= v); + if (s != v) return; + + // Ready to switch to the next k slice. + // Reset counter for the next iteration. + state_switch_[k % P] = (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) + nm_ * nn_; + if (k < nk_) { + // Issue lhs/rhs packing. Their completion will in turn kick off + // kernels. 
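+      // [Editorial note] Sanity check of the reset value above with
+      // hypothetical counts nm_ = 2, nn_ = 3 and parallel packing: the next
+      // slice must collect (2 + 3) packing notifications plus 2 * 3 kernel
+      // notifications before switching, so the counter restarts at 11.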
+ if (parallel_pack_) { + enqueue_packing(k, !shard_by_col_); + enqueue_packing(k, shard_by_col_); + } else if (shard_by_col_) { + enqueue_packing(k, false); + } else { + enqueue_packing(k, true); + } + + // Termination handling. + // Because kernel completion signals k + 2 switch, we need to finish nk + // + 2 slices without issuing any tasks on nk + 1 slice. So here we + // pretend that all nk + 1 packing tasks just finish instantly; so that + // nk + 2 switch only waits for completion of nk kernels. + } else if (k == nk_) { + signal_switch(k + 1, parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)); + } else { + done_.Notify(); + } + } + + // Enqueue all rhs/lhs packing for k-th slice. + void enqueue_packing(Index k, bool rhs) { enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs); } + + void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) { + if (end - start == 1) { + if (rhs) + pack_rhs(start, k); + else + pack_lhs(start, k); + } else { + while (end - start > 1) { + Index mid = (start + end) / 2; + device_.enqueueNoNotification([this, mid, end, k, rhs]() { + enqueue_packing_helper(mid, end, k, rhs); + }); + end = mid; + } + + // Decide if we want to run first packing task (start == 0) in + // async mode if we parallelize only by sharding dim: + // (1) pack_lhs and pack_rhs call signal_switch before completing + // all calls to signal_kernel, which in sync mode might lead + // to the execution of the first kernel of the k+1 slice, before + // completing a call to the last kernel of the k slice. + // (2) all pack tasks for sharded dim must be executed in a thread + // pool to get pre-allocated thead local buffers. + bool pack_async = (start == 0) && (parallelize_by_sharding_dim_only_ && shard_by_col_ == rhs) && + (k > 0 || std::this_thread::get_id() == created_by_thread_id_); + + if (pack_async) { + device_.enqueueNoNotification([this, start, end, k, rhs]() { + enqueue_packing_helper(start, end, k, rhs); + }); + } else { + enqueue_packing_helper(start, end, k, rhs); + } + } + } + + // Block sizes with accounting for potentially incomplete last block. + Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; } + Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; } + Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; } + // Task grain sizes accounting for potentially incomplete last task. + Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; } + Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; } + + EvalParallelContext(const EvalParallelContext&) = delete; + void operator=(const EvalParallelContext&) = delete; + }; + + template + using SyncEvalParallelContext = EvalParallelContext; + + // ------------------------------------------------------------------------ // + + // EvalShardedByInnerDimContext orchestrates sync/async contraction + // evaluation, when we shard by inner dimension. When it is executed in + // asynchronous mode, it owns all the shared state that might be accessible by + // block processing tasks. 
+
+  template <typename DoneCallback>
+  struct EvalShardedByInnerDimContext {
+    EvalShardedByInnerDimContext(const Self* self, int num_threads, Scalar* result_buffer, Index m_size, Index n_size,
+                                 Index k_size, DoneCallback done_callback)
+        : evaluator(self),
+          m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous),
+          m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous),
+          m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered),
+          result(result_buffer),
+          m(m_size),
+          n(n_size),
+          k(k_size),
+          done(std::move(done_callback)),
+          buffer_size_bytes(m * n * sizeof(Scalar)),
+          block_size(blockSize(k, num_threads)),
+          num_blocks(numext::div_ceil(k, block_size)),
+          num_pending_blocks(internal::convert_index<int>(num_blocks)),
+          l0_ranges(numext::div_ceil(num_blocks, l0_size)),
+          l0_state(l0_ranges),
+          block_buffers(num_blocks) {
+      // Keep a count of pending gemm tasks for each l0 range.
+      for (int i = 0; i < l0_ranges; ++i) {
+        const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i);
+        l0_state.emplace_back(internal::convert_index<int>(num_pending_tasks));
+      }
+
+      // Allocate temporary buffers for each block.
+      for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) {
+        Scalar* buf = block_idx == 0 ? result : static_cast<Scalar*>(evaluator->m_device.allocate(buffer_size_bytes));
+        block_buffers.emplace_back(buf);
+      }
+    }
+
+    ~EvalShardedByInnerDimContext() {
+      for (Index i = 1; i < num_blocks; ++i) {
+        evaluator->m_device.deallocate(block_buffers[i]);
+      }
+    }
+
+    template <int Alignment>
+    void run() {
+      Barrier barrier(internal::convert_index<unsigned int>(num_blocks));
+      eval<Alignment>(barrier, 0, num_blocks);
+      barrier.Wait();
+
+      // Aggregate partial sums from l0 ranges.
+      aggregateL0Blocks<Alignment>();
+
+      // Apply output kernel.
+      applyOutputKernel();
+    }
+
+    template <int Alignment>
+    void runAsync() {
+      evalAsync<Alignment>(0, num_blocks);
+    }
+
+   private:
+    // The underlying GEMM kernel assumes that k is a multiple of
+    // the packet size, and subtle breakage occurs if this is violated.
+    static const Index packet_size = internal::packet_traits<RhsScalar>::size;
+
+    const Self* evaluator;  // TensorContraction evaluator
+
+    // These fields are required by the TENSOR_CONTRACTION_DISPATCH macro.
+    bool m_lhs_inner_dim_contiguous;
+    bool m_rhs_inner_dim_contiguous;
+    bool m_rhs_inner_dim_reordered;
+
+    Scalar* result;
+
+    Index m;
+    Index n;
+    Index k;
+
+    DoneCallback done;
+
+    // ----------------------------------------------------------------------//
+    // Algorithm parameters.
+
+    // We will compute partial results into buffers of this size.
+    Index buffer_size_bytes;
+
+    Index block_size;
+    Index num_blocks;
+
+    // Keep track of pending tasks when evaluating in async mode.
+    std::atomic<int> num_pending_blocks;
+
+    // We compute partial gemm results in parallel, and to get the final result
+    // we need to add them all together. For a large number of threads (>= 48)
+    // this adds a very expensive sequential step at the end.
+    //
+    // We split the [0, num_blocks) range into small ranges, and when a task for
+    // a block finishes its partial gemm computation, it checks if it was the
+    // last gemm in the range, and if so, it adds all blocks of the range.
+    //
+    // After all tasks are done, we need to add only these pre-aggregated
+    // blocks.
+
+    // For now we use just a single level of ranges to compute pre-aggregated
+    // partial sums, but in general we can use more layers to compute tree
+    // aggregation in parallel and reduce the size of the sequential step.
+    //
+    // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make
+    // sense only if number of threads >= ~128?
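+    // [Editorial note] Worked example of the range bookkeeping above, with
+    // hypothetical counts: num_blocks = 10 and l0_size = 4 give
+    // l0_ranges = div_ceil(10, 4) = 3. Ranges 0 and 1 hold 4 blocks each, and
+    // the last range holds actualRangeSize(3, 4, 2) = 10 + 4 - 4 * 3 = 2
+    // blocks. The task finishing last in a range folds that range's partial
+    // sums into the range's first block, so the final sequential step adds
+    // only 3 pre-aggregated blocks instead of 10.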
+ static const Index l0_size = 4; + Index l0_ranges; + + // Keep count of pending gemm tasks for each l0 range. + MaxSizeVector> l0_state; // [0, l0_ranges) + + // Buffers allocated for each temporary block computation. + MaxSizeVector block_buffers; // [0, num_blocks) + + template + void processBlock(Index block_idx, Index begin, Index end) { + Scalar* buf = block_buffers[block_idx]; + + TENSOR_CONTRACTION_DISPATCH(evaluator->template evalGemmPartialWithoutOutputKernel, Alignment, + (buf, begin, end, + /*num_threads=*/internal::convert_index(num_blocks))); + + // Check if it was the last task in l0 range. + const Index l0_index = block_idx / l0_size; + const int v = l0_state[l0_index].fetch_sub(1); + eigen_assert(v >= 1); + + // If we processed the last block of the range, we can aggregate all + // partial results into the first block of the range. + if (v == 1) { + const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index); + const Index dst_block_idx = l0_index * l0_size; + + if (rng_size == l0_size) { + addAllToBuffer(m * n, + /*src_buf0=*/block_buffers[dst_block_idx + 1], + /*src_buf1=*/block_buffers[dst_block_idx + 2], + /*src_buf2=*/block_buffers[dst_block_idx + 3], + /*dst_buf= */ block_buffers[dst_block_idx]); + } else { + // Aggregate blocks of potentially incomplete last range. + for (int i = 1; i < rng_size; ++i) { + addToBuffer(m * n, + /*src_buf=*/block_buffers[dst_block_idx + i], + /*dst_buf=*/block_buffers[dst_block_idx]); + } + } + } + } + + // Aggregate partial sums from l0 ranges. + template + void aggregateL0Blocks() const { + Index l0_index = 1; + + for (; l0_index + 2 < l0_ranges; l0_index += 3) { + addAllToBuffer(m * n, + /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size], + /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size], + /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size], + /*dst_buf= */ block_buffers[0]); + } + + for (; l0_index < l0_ranges; ++l0_index) { + addToBuffer(m * n, block_buffers[l0_index * l0_size], block_buffers[0]); + } + } + + void applyOutputKernel() const { + typedef internal::blas_data_mapper OutputMapper; + evaluator->m_output_kernel(OutputMapper(result, m), evaluator->m_tensor_contraction_params, + static_cast(0), static_cast(0), m, n); + } + + // Compute block size with accounting for potentially incomplete last block. + Index actualBlockSize(Index block_idx) const { + return block_idx + 1 < num_blocks ? block_size : k + block_size - block_size * num_blocks; + }; + + // Compute range size with accounting for potentially incomplete last range. + Index actualRangeSize(Index num_ranges, Index range_size, Index range_idx) const { + eigen_assert(range_idx < num_ranges); + return range_idx + 1 < num_ranges ? 
range_size : num_blocks + range_size - range_size * num_ranges; + }; + + template + EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf, Scalar* tgt_buf) { + const int output_packet_size = internal::unpacket_traits::size; + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const PacketReturnType src_val = internal::pload(src_buf + i); + const PacketReturnType tgt_val = internal::ploadt(tgt_buf + i); + const PacketReturnType sum = internal::padd(src_val, tgt_val); + internal::pstoret(tgt_buf + i, sum); + } + for (; i < n; ++i) { + tgt_buf[i] += src_buf[i]; + } + } + + template + EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n, const Scalar* src_buf0, const Scalar* src_buf1, + const Scalar* src_buf2, Scalar* dst_buf) { + using ::Eigen::internal::padd; + using ::Eigen::internal::pload; + using ::Eigen::internal::ploadt; + using ::Eigen::internal::pstoret; + + const int output_packet_size = internal::unpacket_traits::size; + + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const auto src_val0 = pload(src_buf0 + i); + const auto src_val1 = pload(src_buf1 + i); + const auto src_val2 = pload(src_buf2 + i); + + const auto dst_val = ploadt(dst_buf + i); + const auto sum = padd(padd(dst_val, src_val0), padd(src_val1, src_val2)); + + pstoret(dst_buf + i, sum); + } + for (; i < n; ++i) { + dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i]; + } + } + + template + void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) { + while (end_block_idx - start_block_idx > 1) { + Index mid_block_idx = (start_block_idx + end_block_idx) / 2; + evaluator->m_device.enqueueNoNotification([this, &barrier, mid_block_idx, end_block_idx]() { + eval(barrier, mid_block_idx, end_block_idx); + }); + end_block_idx = mid_block_idx; + } + + Index block_idx = start_block_idx; + Index block_start = block_idx * block_size; + Index block_end = block_start + actualBlockSize(block_idx); + + processBlock(block_idx, block_start, block_end); + barrier.Notify(); + } + + template + void evalAsync(Index start_block_idx, Index end_block_idx) { + while (end_block_idx - start_block_idx > 1) { + Index mid_block_idx = (start_block_idx + end_block_idx) / 2; + evaluator->m_device.enqueueNoNotification( + [this, mid_block_idx, end_block_idx]() { + evalAsync(mid_block_idx, end_block_idx); + }); + end_block_idx = mid_block_idx; + } + + Index block_idx = start_block_idx; + + Index block_start = block_idx * block_size; + Index block_end = block_start + actualBlockSize(block_idx); + + processBlock(block_idx, block_start, block_end); + + int v = num_pending_blocks.fetch_sub(1); + eigen_assert(v >= 1); + + if (v == 1) { + // Aggregate partial sums from l0 ranges. + aggregateL0Blocks(); + + // Apply output kernel. + applyOutputKernel(); + + // NOTE: If we call `done` callback before deleting this (context), + // it might deallocate Self* pointer captured by context, and we'll + // fail in destructor trying to deallocate temporary buffers. + + // Move done call back from context before it will be destructed. + DoneCallback done_copy = std::move(done); + + // We are confident that we are the last one who touches context. + delete this; + + // Now safely call the done callback. 
+      done_copy();
+    }
+  }
+
+  // The cost model doesn't capture well the cost associated with constructing
+  // tensor contraction mappers and computing loop bounds in gemm_pack_lhs
+  // and gemm_pack_rhs, so we specify a minimum desired block size.
+  static Index blockSize(Index k, int num_threads) {
+    const auto round_up = [=](Index index) -> Index {
+      const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
+      return numext::div_ceil(index, kmultiple) * kmultiple;
+    };
+
+    const Index target_block_size = round_up(numext::div_ceil(k, num_threads));
+    const Index desired_min_block_size = 12 * packet_size;
+
+    return numext::mini(k, numext::maxi(desired_min_block_size, target_block_size));
+  }
+
+  EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete;
+  void operator=(const EvalShardedByInnerDimContext&) = delete;
+};
+
+// ------------------------------------------------------------------------ //
+
+// Below are the functions used by the evalProductImpl heuristics to select
+// optimal parameters for the parallelization algorithm.
+
+// Decide whether we want to shard the m x n contraction by columns or by rows.
+static bool shardByCol(Index m, Index n, Index num_threads) {
+  // Note: we are comparing both n and m against Traits::nr; this is not
+  // a mistake. We are trying to figure out how both n and m will fit into
+  // the main sharding dimension.
+
+  // Sharding by column is the default
+  // ... unless there is enough data for vectorization over rows
+  if (m / num_threads >= Traits::nr &&
+      // and not enough data for vectorization over columns
+      (n / num_threads < Traits::nr ||
+       // ... or barely enough data for vectorization over columns,
+       // but it is not evenly divisible across threads
+       (n / num_threads < 4 * Traits::nr && (n % (num_threads * Traits::nr)) != 0 &&
+        // ... and it is evenly divisible across threads for rows
+        ((m % (num_threads * Traits::nr)) == 0 ||
+         // .. or it is not evenly divisible in either dimension, but
+         // there is much more data over rows so that corner effects are
+         // mitigated.
+         (m / n >= 6)))))
+    return false;
+  // ... or shard by rows after all if the matrix is just substantially
+  // elongated over the row dimension.
+  if (n / num_threads < 16 * Traits::nr && m > n * 32) return false;
+  return true;
+}
+
+Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn, int num_threads, bool shard_by_col) const {
+  Index gm = 1;
+  Index gm1 = 1;
+  Index nm0 = numext::div_ceil(m, bm);
+  Index nm1 = nm0;
+  for (;;) {
+    // Find the next candidate for the m grain size. It needs to result in a
+    // different number of blocks. E.g. if we have 10 kernels, we want to try
+    // 5 and 10, but not 6, 7, 8 and 9.
+    while (gm1 <= nm0 && nm1 == numext::div_ceil(nm0, gm1)) gm1++;
+    if (gm1 > nm0) break;
+    // Check the candidate.
+    int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads, shard_by_col);
+    if (res < 0) break;
+    nm1 = numext::div_ceil(nm0, gm1);
+    if (res == 0) continue;
+    // Commit new grain size.
+ gm = gm1; + } + return gm; + } + + Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm, int num_threads, bool shard_by_col) const { + Index gn = 1; + Index gn1 = 1; + Index nn0 = numext::div_ceil(n, bn); + Index nn1 = nn0; + for (;;) { + while (gn1 <= nn0 && nn1 == numext::div_ceil(nn0, gn1)) gn1++; + if (gn1 > nn0) break; + int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads, shard_by_col); + if (res < 0) break; + nn1 = numext::div_ceil(nn0, gn1); + if (res == 0) continue; + gn = gn1; + } + return gn; + } + + // checkGrain checks whether grain (gm, gn) is suitable and is better than + // (oldgm, oldgn). + int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm, Index gn, Index oldgm, Index oldgn, + int num_threads, bool shard_by_col) const { + const TensorOpCost cost = contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true); + double taskSize = TensorCostModel::taskSize(static_cast(bm) * gm * bn * gn, cost); + // If the task is too small, then we agree on it regardless of anything + // else. Otherwise synchronization overheads will dominate. + if (taskSize < 1) return 1; + // If it is too large, then we reject it and all larger tasks. + if (taskSize > 2) return -1; + // Now we are in presumably good task size range. + // The main deciding factor here is parallelism. Consider that we have 12 + // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes. + // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4 + // of cores will be busy). While grain size 3 gives us 4 tasks, which gives + // us parallelism of 1 (we can load all cores). + Index nm0 = numext::div_ceil(m, bm); + Index nn0 = numext::div_ceil(n, bn); + Index new_tasks = numext::div_ceil(nm0, gm) * numext::div_ceil(nn0, gn); + double new_parallelism = + static_cast(new_tasks) / (numext::div_ceil(new_tasks, num_threads) * num_threads); + Index old_tasks = numext::div_ceil(nm0, oldgm) * numext::div_ceil(nn0, oldgn); + double old_parallelism = + static_cast(old_tasks) / (numext::div_ceil(old_tasks, num_threads) * num_threads); + if (new_parallelism > old_parallelism || new_parallelism == 1) return 1; + return 0; + } + + TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, bool shard_by_col, + bool prepacked) const { + const int packed_size = std::min(PacketType::size, PacketType::size); + const int output_packet_size = internal::unpacket_traits::size; + const double kd = static_cast(bk); + double compute_bandwidth = computeBandwidth(false, bm, bn, bk); + // Computations. + TensorOpCost cost = TensorOpCost(0, 0, kd * compute_bandwidth, true, packed_size); + // Output stores. + cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); + if (prepacked) { + // Packing and kernels are executed in different tasks. When we calculate + // task grain size we look only at kernel cost assuming that kernel + // is more expensive than packing. + return cost; + } + // Lhs/rhs loads + computations. + TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n); + TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m); + // Lhs packing memory cost does not contribute considerably to overall + // execution time because lhs is prefetched early and accessed sequentially. 
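+    // [Editorial note] For scale, with a hypothetical bk = 128 and
+    // compute_bandwidth = 0.5, the compute term added above is
+    // kd * compute_bandwidth = 64 cost units per output coefficient, so the
+    // packing memory term dropped below is a second-order correction.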
+ if (shard_by_col) + lhsCost.dropMemoryCost(); + else + rhsCost.dropMemoryCost(); + return cost + lhsCost + rhsCost; + } + + // Decide whether we want to shard m x k x n contraction over the inner + // (contraction) dimension (k). + static bool shardByInnerDim(Index m, Index n, Index k, int num_threads, int num_threads_by_k) { + std::ptrdiff_t bufsize = m * n * sizeof(Scalar); + bool shard_by_k = false; + if (n == 1 || // If mat*vec or... + num_threads_by_k < 2 || // running single threaded or... + num_threads_by_k < num_threads || // sharding by k gives less parallelism or... + bufsize > l3CacheSize() / num_threads_by_k || // need more buffer space + // than L3 cache or... + k / num_threads_by_k < 2 * Traits::nr) { // k per thread is tiny. + shard_by_k = false; + } else if (numext::maxi(m, n) / num_threads < Traits::nr || // both other dimensions are tiny or... + // k per thread is not small and... + (k / num_threads_by_k > 8 * Traits::nr && + // one of the outer dimensions is tiny or sharding by k offers + // more parallelism. + (numext::mini(m, n) < 2 * Traits::nr || num_threads_by_k > num_threads))) { + shard_by_k = true; + } + return shard_by_k; + } + + TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const { + // Compute cost. + const int output_packet_size = internal::unpacket_traits::size; + TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size); + // Output stores. + cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); + TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m; + TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * n; + // Since the inner gemm kernel is always sharded by column, the lhs + // load cost is negligible. + lhsCost.dropMemoryCost(); + return cost + lhsCost + rhsCost; + } + + int numThreadsInnerDim(Index m, Index n, Index k) const { + const int output_packet_size = internal::unpacket_traits::size; + TensorOpCost cost = contractionCostPerInnerDim(m, n, k); + double total_parallel_cost = TensorCostModel::totalCost(k, cost); + // Cost of reduction step accumulating the m*n per-thread buffers into the + // result. + double reduction_cost = + TensorCostModel::totalCost(m * n, TensorOpCost(2, 1, 1, true, output_packet_size)); + int num_threads = 1; + double min_cost = total_parallel_cost; + double kPerThreadOverHead = 3000; + double kFixedOverHead = 100000; + for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) { + double sequential_cost = kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead); + double parallel_cost = total_parallel_cost / nt + sequential_cost; + if (parallel_cost < min_cost) { + num_threads = nt; + min_cost = parallel_cost; + } + } + return num_threads; + } + + double computeBandwidth(bool shard_by_col, Index bm, Index bn, Index bk) const { + // Peak VFMA bandwidth is 0.5. However if we have not enough data for + // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined + // experimentally. + double computeBandwidth = bk == 1 ? 4.0 + : (shard_by_col ? bn : bm) < Traits::nr || (shard_by_col ? bm : bn) < Traits::mr ? 2.0 + : 0.5; +#ifndef EIGEN_VECTORIZE_FMA + // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. + // However for MULPS/ADDPS we have dependent sequence of 2 such + // instructions, + // so overall bandwidth is 1.0. 
+ if (computeBandwidth == 0.5) computeBandwidth = 1.0; +#endif + return computeBandwidth; + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_THREADS +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h new file mode 100644 index 00000000000..5e4a586d8b1 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -0,0 +1,416 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorConversionOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor conversion class. This class makes it possible to vectorize + * type casting operations when the number of scalars per packet in the source + * and the destination type differ + */ +namespace internal { +template +struct traits > { + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef TargetType Scalar; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = traits::Layout; + enum { Flags = 0 }; + typedef typename TypeConversion::PointerType>::type PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorConversionOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorConversionOp type; +}; + +} // end namespace internal + +template +struct PacketConverter; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + return internal::pcast(m_impl.template packet(index)); + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); + SrcPacket src3 = m_impl.template 
packet(index + 2 * SrcPacketSize); + SrcPacket src4 = m_impl.template packet(index + 3 * SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2, src3, src4); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + 1 * SrcPacketSize); + SrcPacket src3 = m_impl.template packet(index + 2 * SrcPacketSize); + SrcPacket src4 = m_impl.template packet(index + 3 * SrcPacketSize); + SrcPacket src5 = m_impl.template packet(index + 4 * SrcPacketSize); + SrcPacket src6 = m_impl.template packet(index + 5 * SrcPacketSize); + SrcPacket src7 = m_impl.template packet(index + 6 * SrcPacketSize); + SrcPacket src8 = m_impl.template packet(index + 7 * SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2, src3, src4, src5, src6, src7, src8); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) + : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + // Only call m_impl.packet() when we have direct access to the underlying data. This + // ensures that we don't compute the subexpression twice. We may however load some + // coefficients twice, but in practice this doesn't negatively impact performance. 
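+    // [Editorial note] Worked example of the SrcCoeffRatio:TgtCoeffRatio
+    // machinery, assuming SSE-style packets: casting double -> float has
+    // ratio 2:1 (two Packet2d loads feed one pcast that fills a Packet4f),
+    // which is the two-load specialization above; casting float -> double
+    // has ratio 1:2 (one Packet4f holds enough coefficients for two Packet2d
+    // results), which is this specialization -- hence the bounds check
+    // against m_maxIndex before touching a whole source packet.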
+ if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) { + // Force unaligned memory loads since we can't ensure alignment anymore + return internal::pcast(m_impl.template packet(index)); + } else { + const int TgtPacketSize = internal::unpacket_traits::size; + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TgtType; + internal::scalar_cast_op converter; + EIGEN_ALIGN_MAX typename internal::unpacket_traits::type values[TgtPacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < TgtPacketSize; ++i) { + values[i] = converter(m_impl.coeff(index + i)); + } + TgtPacket rslt = internal::pload(values); + return rslt; + } + } + + private: + const TensorEvaluator& m_impl; + const typename TensorEvaluator::Index m_maxIndex; +}; + +template +class TensorConversionOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::nested::type Nested; + typedef Scalar CoeffReturnType; + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr) : m_xpr(xpr) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + +template +struct ConversionSubExprEval { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) { + impl.evalSubExprsIfNeeded(NULL); + return true; + } +}; + +template +struct ConversionSubExprEval { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { return impl.evalSubExprsIfNeeded(data); } +}; + +#ifdef EIGEN_USE_THREADS +template +struct ConversionSubExprEvalAsync { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) { + impl.evalSubExprsIfNeededAsync(nullptr, std::move(done)); + } +}; + +template +struct ConversionSubExprEvalAsync { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) { + impl.evalSubExprsIfNeededAsync(data, std::move(done)); + } +}; +#endif + +namespace internal { + +template +struct CoeffConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator& impl, + Index index) { + internal::scalar_cast_op converter; + return converter(impl.coeff(index)); + } +}; + +template +struct CoeffConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator& impl, + Index index) { + return impl.coeff(index); + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TargetType; + + static constexpr int PacketSize = internal::unpacket_traits::size; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, + Index index) { + internal::scalar_cast_op converter; + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = converter(impl.coeff(index + i)); + } + TargetPacket rslt = internal::pload(values); + return rslt; + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TargetType; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, + 
Index index) { + const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; + const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; + PacketConverter, SrcPacket, TargetPacket, SrcCoeffRatio, TgtCoeffRatio> converter( + impl); + return converter.template packet(index); + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type TargetType; + static constexpr int PacketSize = internal::unpacket_traits::size; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, + Index index) { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index + i); + return internal::pload(values); + } +}; + +template +struct PacketConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, + Index index) { + return impl.template packet(index); + } +}; + +} // namespace internal + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorConversionOp XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef TargetType Scalar; + typedef TargetType CoeffReturnType; + typedef internal::remove_all_t::Scalar> SrcType; + typedef typename PacketType::type PacketReturnType; + typedef typename PacketType::type PacketSourceType; + static constexpr int PacketSize = PacketType::size; + static constexpr bool IsSameType = internal::is_same::value; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = false, + PacketAccess = +#ifndef EIGEN_USE_SYCL + true, +#else + TensorEvaluator::PacketAccess & + internal::type_casting_traits::VectorizedCast, +#endif + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + RawAccess = false + }; + + static constexpr int Layout = TensorEvaluator::Layout; + static constexpr int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + struct TensorConversionOpBlockFactory { + template + struct XprType { + typedef TensorConversionOp type; + }; + + template + typename XprType::type expr(const ArgXprType& expr) const { + return typename XprType::type(expr); + } + }; + + typedef internal::TensorUnaryExprBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + return ConversionSubExprEval, EvaluatorPointerType>::run(m_impl, data); + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) { + ConversionSubExprEvalAsync, EvaluatorPointerType, + EvalSubExprsCallback>::run(m_impl, data, std::move(done)); + } +#endif + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return 
internal::CoeffConv::run(m_impl, index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + // If we are not going to do the cast, we just need to check that base + // TensorEvaluator has packet access. Otherwise we also need to make sure, + // that we have an implementation of vectorized cast. + const bool Vectorizable = IsSameType ? TensorEvaluator::PacketAccess + : int(TensorEvaluator::PacketAccess) & + int(internal::type_casting_traits::VectorizedCast); + + return internal::PacketConv::run(m_impl, + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double cast_cost = TensorOpCost::CastCost(); + if (vectorized) { + const double SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; + const double TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; + return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) + + TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize)); + } else { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return m_impl.getResourceRequirements(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + return TensorBlock(m_impl.block(desc, scratch), TensorConversionOpBlockFactory()); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + /// required by sycl in order to extract the sycl accessor + const TensorEvaluator& impl() const { return m_impl; } + + protected: + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h new file mode 100644 index 00000000000..26984b69090 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -0,0 +1,1123 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. 
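+ * (Editor's illustrative example, not from the upstream header.) The op is
+ * normally built via TensorBase::convolve(); e.g. a valid-mode 2-D
+ * convolution over both dimensions:
+ * \code
+ * Eigen::Tensor<float, 2> input(8, 8);
+ * Eigen::Tensor<float, 2> kernel(3, 3);
+ * input.setRandom();
+ * kernel.setRandom();
+ * Eigen::array<ptrdiff_t, 2> dims = {0, 1};
+ * // Each convolved dimension shrinks to input - kernel + 1: 6 x 6 here.
+ * Eigen::Tensor<float, 2> output = input.convolve(kernel, dims);
+ * \endcode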
+ * + * + */ +namespace internal { + +template +class IndexMapper { + public: + IndexMapper(const InputDims& input_dims, const array& kernel_dims, + const array& indices) { + array dimensions = input_dims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = indices[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + dimensions[index] = result_dim; + } + + array inputStrides; + array outputStrides; + if (static_cast(Layout) == static_cast(ColMajor)) { + inputStrides[0] = 1; + outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1]; + outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1]; + } + } else { + inputStrides[NumDims - 1] = 1; + outputStrides[NumDims - 1] = 1; + for (int i = static_cast(NumDims) - 2; i >= 0; --i) { + inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; + } + } + + array gpuInputDimensions; + array gpuOutputDimensions; + array tmp = dimensions; + array ordering; + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = i + offset; + ordering[index] = indices[i]; + tmp[indices[i]] = -1; + gpuInputDimensions[index] = input_dims[indices[i]]; + gpuOutputDimensions[index] = dimensions[indices[i]]; + } + + int written = static_cast(Layout) == static_cast(ColMajor) ? NumKernelDims : 0; + for (int i = 0; i < NumDims; ++i) { + if (tmp[i] >= 0) { + ordering[written] = i; + gpuInputDimensions[written] = input_dims[i]; + gpuOutputDimensions[written] = dimensions[i]; + ++written; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[ordering[i]]; + m_outputStrides[i] = outputStrides[ordering[i]]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (i > NumKernelDims) { + m_gpuInputStrides[i] = m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1]; + m_gpuOutputStrides[i] = m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1]; + } else { + m_gpuInputStrides[i] = 1; + m_gpuOutputStrides[i] = 1; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (static_cast(i + 1) < offset) { + m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1]; + m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1]; + } else { + m_gpuInputStrides[i] = 1; + m_gpuOutputStrides[i] = 1; + } + } + } + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_gpuInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_gpuInputStrides[d]; + } + if (NumKernelDims < NumDims) { + inputIndex += p * m_inputStrides[NumKernelDims]; + } + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_gpuInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_gpuInputStrides[d]; + } + inputIndex += p * m_inputStrides[limit]; + } + return inputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const { + Index outputIndex = 0; + if 
(static_cast(Layout) == static_cast(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_gpuOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_gpuOutputStrides[d]; + } + if (NumKernelDims < NumDims) { + outputIndex += p * m_outputStrides[NumKernelDims]; + } + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_gpuOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_gpuOutputStrides[d]; + } + outputIndex += p * m_outputStrides[limit]; + } + return outputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_inputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_outputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + k * m_inputStrides[offset + 2]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + k * m_outputStrides[offset + 2]; + } + + private: + static constexpr int NumDims = internal::array_size::value; + array m_inputStrides; + array m_outputStrides; + array m_gpuInputStrides; + array m_gpuOutputStrides; +}; + +template +struct traits > { + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
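+  // (Editor's aside -- illustrative, not part of the upstream patch.) The
+  // IndexMapper above assumes "valid"-mode shapes: each convolved dimension
+  // shrinks to input_dim - kernel_dim + 1, and the strides are the usual
+  // running products. A standalone sketch of the ColMajor branch:
+  //
+  //   std::array<long, 3> input_dims = {8, 8, 4};
+  //   std::array<long, 3> out_dims = input_dims;
+  //   out_dims[0] = input_dims[0] - 3 + 1;  // size-3 kernel on dim 0 -> 6
+  //   std::array<long, 3> in_strides, out_strides;
+  //   in_strides[0] = out_strides[0] = 1;
+  //   for (int i = 1; i < 3; ++i) {
+  //     in_strides[i] = in_strides[i - 1] * input_dims[i - 1];
+  //     out_strides[i] = out_strides[i - 1] * out_dims[i - 1];
+  //   }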
+ typedef typename promote_storage_type::ret Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, typename traits::Index>::type + Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = traits::Layout; + typedef std::conditional_t::val, + typename traits::PointerType, typename traits::PointerType> + PointerType; + + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorConvolutionOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorConvolutionOp type; +}; + +} // end namespace internal + +template +class TensorConvolutionOp + : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, + const Indices& dims) + : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t& inputExpression() + const { + return m_input_xpr; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t& kernelExpression() + const { + return m_kernel_xpr; + } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + +template +struct TensorEvaluator, Device> { + typedef TensorConvolutionOp XprType; + + static constexpr int NumDims = + internal::array_size::Dimensions>::value; + static constexpr int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = + int(TensorEvaluator::IsAligned) & int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), + m_kernelImpl(op.kernelExpression(), device), + m_kernelArg(op.kernelExpression()), + m_kernel(NULL), + m_local_kernel(false), + 
m_device(device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; + } + } + + m_dimensions = m_inputImpl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = NumKernelDims - 1; i >= 0; --i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i < NumKernelDims - 1) { + m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; + } else { + m_kernelStride[NumKernelDims - 1] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_inputImpl.evalSubExprsIfNeeded(NULL); + preloadKernel(); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + void evalTo(typename XprType::Scalar* buffer) { + evalSubExprsIfNeeded(NULL); + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + CoeffReturnType result = CoeffReturnType(0); + convolve(firstInput(index), 0, NumKernelDims - 1, result); + return result; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const { + Index indices[2] = {index, index + PacketSize - 1}; + Index startInputs[2] = {0, 0}; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 
* m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } + startInputs[0] += indices[0]; + startInputs[1] += indices[1]; + + if (startInputs[1] - startInputs[0] == PacketSize - 1) { + PacketReturnType result = internal::pset1(0); + convolvePacket(startInputs[0], 0, NumKernelDims - 1, result); + return result; + } else { + EIGEN_ALIGN_MAX Scalar data[PacketSize]; + data[0] = Scalar(0); + convolve(startInputs[0], 0, NumKernelDims - 1, data[0]); + for (int i = 1; i < PacketSize - 1; ++i) { + data[i] = Scalar(0); + convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]); + } + data[PacketSize - 1] = Scalar(0); + convolve(startInputs[1], 0, NumKernelDims - 1, data[PacketSize - 1]); + return internal::pload(data); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize)); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } + startInput += index; + return startInput; + } + + EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolve(input, kernel, DimIndex - 1, accum); + } else { + accum += m_inputImpl.coeff(input) * m_kernel[kernel]; + } + } + } + + template + EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolvePacket(input, kernel, DimIndex - 1, accum); + } else { + accum = internal::pmadd(m_inputImpl.template packet(input), + internal::pset1(m_kernel[kernel]), accum); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. 
it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool Vectorize = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + array m_inputStride; + array m_outputStride; + + array m_indexStride; + array m_kernelStride; + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device EIGEN_DEVICE_REF m_device; +}; + +// Use an optimized implementation of the evaluation code for GPUs whenever possible. +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) + +template +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int /*kernelSize*/) const { return StaticKernelSize; } +}; +template <> +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int kernelSize) const { return kernelSize; } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D( + InputEvaluator eval, const internal::IndexMapper indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, + float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else + extern __shared__ float s[]; +#endif + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? 
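+  // (Editor's aside -- illustrative, not part of the upstream patch.) Each
+  // block tiles the convolved dimension: it stages last_x - first_x +
+  // kernelSize inputs in shared memory and emits last_x - first_x + 1
+  // outputs, so neighbouring outputs reuse kernelSize - 1 staged values
+  // instead of re-reading global memory.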
first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSize); + const int num_x_output = last_x - first_x + 1; + + const int first_plane = blockIdx.y * blockDim.y; + const int plane_stride = blockDim.y * gridDim.y; + + for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { + // Load inputs to shared memory + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.y * num_x_input; +#pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); + } + + __syncthreads(); + + // Compute the convolution + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); + +#pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + const int kernel_offset = plane_kernel_offset + i; + float result = 0.0f; +#pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { + result += s[k + kernel_offset] * kernel[k]; + } + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x); + buffer[tensor_index] = result; + } + __syncthreads(); + } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D( + InputEvaluator eval, const internal::IndexMapper indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, + const int kernelSizeX, const int kernelSizeY, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else + extern __shared__ float s[]; +#endif + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSizeX); + const int num_x_output = last_x - first_x + 1; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? 
first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + GetKernelSize()(kernelSizeY); + const int num_y_output = last_y - first_y + 1; + + const int first_plane = blockIdx.z * blockDim.z; + const int plane_stride = blockDim.z * gridDim.z; + + for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.z * num_y_input; + +// Load inputs to shared memory +#pragma unroll + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + const int input_offset = num_x_input * (j + plane_kernel_offset); +#pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = + plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y); + s[i + input_offset] = eval.coeff(tensor_index); + } + } + + __syncthreads(); + + // Convolution + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); + +#pragma unroll + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { +#pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; +#pragma unroll + for (int l = 0; l < GetKernelSize()(kernelSizeY); ++l) { + const int kernel_offset = kernelSizeX * l; + const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); +#pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSizeX); ++k) { + result += s[k + input_offset] * kernel[k + kernel_offset]; + } + } + const int tensor_index = + plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y); + buffer[tensor_index] = result; + } + } + + __syncthreads(); + } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D( + InputEvaluator eval, const internal::IndexMapper indexMapper, + const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY, + const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, + const size_t kernelSizeZ, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else + extern __shared__ float s[]; +#endif + + // Load inputs to shared memory + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + kernelSizeX; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + kernelSizeY; + + const int first_z = blockIdx.z * maxZ; + const int last_z = (first_z + maxZ < numZ ? 
first_z + maxZ : numZ) - 1; + const int num_z_input = last_z - first_z + kernelSizeZ; + + for (int p = 0; p < numPlanes; ++p) { + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = 0; + + for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( + i + first_x, j + first_y, k + first_z); + s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); + } + } + } + + __syncthreads(); + + // Convolution + const int num_z_output = last_z - first_z + 1; + const int num_y_output = last_y - first_y + 1; + const int num_x_output = last_x - first_x + 1; + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); + + for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + for (int n = 0; n < kernelSizeZ; ++n) { + for (int m = 0; m < kernelSizeY; ++m) { + for (int l = 0; l < kernelSizeX; ++l) { + result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * + kernel[l + kernelSizeX * (m + kernelSizeY * n)]; + } + } + } + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset( + i + first_x, j + first_y, k + first_z); + buffer[tensor_index] = result; + } + } + } + __syncthreads(); + } +}; + +template +struct TensorEvaluator, GpuDevice> { + typedef TensorConvolutionOp XprType; + + static constexpr int NumDims = + internal::array_size::Dimensions>::value; + static constexpr int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = + TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), + m_kernelImpl(op.kernelExpression(), device), + m_kernelArg(op.kernelExpression()), + m_indices(op.indices()), + m_buf(NULL), + m_kernel(NULL), + m_local_kernel(false), + m_device(device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + typedef typename XprType::CoeffReturnType 
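+  // (Editor's aside -- illustrative, not part of the upstream patch.) Note
+  // the evalSubExprsIfNeeded() contract used below: given a destination
+  // buffer it evaluates directly into it and returns false (nothing to read
+  // back); given NULL it allocates m_buf, evaluates into that, and returns
+  // true so coeff()/packet() can serve reads from the staging buffer.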
CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static constexpr int PacketSize = internal::unpacket_traits::size; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + static unsigned int ceil(unsigned int num, unsigned int denom) { + const unsigned int rounded_toward_zero = num / denom; + if (num > rounded_toward_zero * denom) { + return rounded_toward_zero + 1; + } + return rounded_toward_zero; + } + + void executeEval(Scalar* data) const { + typedef typename TensorEvaluator::Dimensions InputDims; + + const int maxSharedMem = m_device.sharedMemPerBlock(); + const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock(); + const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = m_device.getNumGpuMultiProcessors(); + const int warpSize = 32; + + switch (NumKernelDims) { + case 1: { + const int kernel_size = m_kernelImpl.dimensions().TotalSize(); + + const int numX = dimensions()[m_indices[0]]; + const int numP = dimensions().TotalSize() / numX; + int maxX; + dim3 block_size; + + const int single_stride_dim = + static_cast(Layout) == static_cast(ColMajor) ? 
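+  // (Editor's aside -- illustrative, not part of the upstream patch.) The
+  // ceil() helper above is plain round-up division -- ceil(10, 4) == 3,
+  // ceil(8, 4) == 2 -- used so that num_blocks * tile covers the extent.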
0 : m_inputImpl.dimensions().rank() - 1; + if (m_indices[0] == single_stride_dim) { + // Maximum the reuse + const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; + maxX = numext::mini(inner_dim, numX); + const int maxP = numext::mini(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); + block_size.x = numext::mini(maxThreadsPerBlock, maxX); + block_size.y = numext::mini(maxThreadsPerBlock / block_size.x, maxP); + } else { + // Read as much as possible alongside the inner most dimension, that is the plane + const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); + const int maxP = numext::mini(inner_dim, numP); + maxX = numext::mini(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); + + block_size.x = numext::mini(warpSize, maxX); + block_size.y = numext::mini(maxThreadsPerBlock / block_size.x, maxP); + } + + const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); + gpu_assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); + + dim3 num_blocks(num_x_blocks, numext::mini(num_y_blocks, ceil(numP, block_size.y))); + + // cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " + // num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " + // << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices{m_indices[0]}; + const array kernel_dims{m_kernelImpl.dimensions()[0]}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size) { + case 4: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 4>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, + numX, maxX, 4, data); + break; + } + case 7: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, + numX, maxX, 7, data); + break; + } + default: { + LAUNCH_GPU_KERNEL( + (EigenConvolutionKernel1D, Index, InputDims, Dynamic>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, + kernel_size, data); + } + } + break; + } + + case 2: { + const int idxX = static_cast(Layout) == static_cast(ColMajor) ? 0 : 1; + const int idxY = static_cast(Layout) == static_cast(ColMajor) ? 
1 : 0; + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numP = dimensions().TotalSize() / (numX * numY); + + const float scaling_factor = + sqrtf(static_cast(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); + + // Snap maxX to warp size + int inner_dim = ((static_cast(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; + const int maxX = numext::mini(inner_dim, numX); + const int maxY = + numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); + const int maxP = numext::mini( + maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); + + dim3 block_size; + block_size.x = numext::mini(1024, maxX); + block_size.y = numext::mini(1024 / block_size.x, maxY); + block_size.z = numext::mini(1024 / (block_size.x * block_size.y), maxP); + + const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); + gpu_assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int num_y_blocks = ceil(numY, maxY); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); + + dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini(num_z_blocks, ceil(numP, block_size.z))); + + // cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " + // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << + // " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " + // shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices{m_indices[idxX], m_indices[idxY]}; + const array kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size_x) { + case 4: { + switch (kernel_size_y) { + case 7: { + LAUNCH_GPU_KERNEL( + (EigenConvolutionKernel2D, Index, InputDims, 4, 7>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, + numY, maxY, 4, 7, data); + break; + } + default: { + LAUNCH_GPU_KERNEL( + (EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, + numY, maxY, 4, kernel_size_y, data); + break; + } + } + break; + } + case 7: { + switch (kernel_size_y) { + case 4: { + LAUNCH_GPU_KERNEL( + (EigenConvolutionKernel2D, Index, InputDims, 7, 4>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, + numY, maxY, 7, 4, data); + break; + } + default: { + LAUNCH_GPU_KERNEL( + (EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, + numY, maxY, 7, kernel_size_y, data); + break; + } + } + break; + } + default: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, + Dynamic, Dynamic>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, + numX, maxX, 
numY, maxY, kernel_size_x, kernel_size_y, data); + break; + } + } + break; + } + + case 3: { + const int idxX = static_cast(Layout) == static_cast(ColMajor) ? 0 : 2; + const int idxY = static_cast(Layout) == static_cast(ColMajor) ? 1 : 1; + const int idxZ = static_cast(Layout) == static_cast(ColMajor) ? 2 : 0; + + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numZ = dimensions()[m_indices[idxZ]]; + const int numP = dimensions().TotalSize() / (numX * numY * numZ); + + const int maxX = numext::mini( + 128, numext::mini(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, + numX)); + const int maxY = numext::mini( + 128, numext::mini( + maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, + numY)); + const int maxZ = numext::mini( + 128, numext::mini( + maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - + kernel_size_z + 1, + numZ)); + + dim3 block_size; + block_size.x = numext::mini(32, maxX); + block_size.y = numext::mini(32, maxY); + block_size.z = numext::mini(1024 / (block_size.x * block_size.y), maxZ); + dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); + + const int shared_mem = + (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); + gpu_assert(shared_mem <= maxSharedMem); + + // cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " + // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << + // " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << + // endl; + const array indices{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}; + const array kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY], + m_kernelImpl.dimensions()[idxZ]}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, + maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. 
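+    // (Editor's aside -- illustrative, not part of the upstream patch.) For
+    // a 3x3 kernel this charges kernel_size = 9 multiply/add pairs per output
+    // coefficient, i.e. about 9 * (AddCost + MulCost) of compute on top of
+    // the per-output index arithmetic accounted in firstIndex_compute_cost.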
+ const double convolve_compute_cost = TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize)); + } + + private: + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + KernelArgType m_kernelArg; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + + const GpuDevice& m_device; +}; +#endif + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h new file mode 100644 index 00000000000..88ae9f4bb5b --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -0,0 +1,546 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Copyright (C) 2016 Benoit Steiner + +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. 
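+ * (Editor's illustrative note, not from the upstream header.) This file
+ * supplies the SyclDevice specializations of the convolution evaluator; the
+ * public entry point is unchanged. Assuming a SYCL queue wrapper is set up
+ * (the selector spelling varies with the SYCL version in use):
+ * \code
+ * Eigen::QueueInterface queue_interface{cl::sycl::default_selector{}};
+ * Eigen::SyclDevice sycl_device(&queue_interface);
+ * output.device(sycl_device) = input.convolve(kernel, dims);
+ * \endcode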
+ *
+ *
+ */
+
+enum class convolution_type { CONV1D, CONV2D, CONV3D };
+template
+struct EigenConvolutionKernel;
+template
+struct EigenConvolutionKernel {
+  typedef cl::sycl::accessor
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper indexMapper;
+  const size_t kernelSize;
+  const cl::sycl::range<2> input_range;
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper indexMapper_,
+                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernelSize(kernelSize_),
+        input_range(input_range_) {}
+
+  template
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const {
+    return (boolean_check[0] && boolean_check[1]);
+  }
+  void operator()(cl::sycl::nd_item<2> itemID) const {
+    auto buffer_ptr = buffer_acc;
+    auto kernel_ptr = kernel_filter;
+    // the rows required for each plane in shared memory
+    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
+    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
+    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
+    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
+    /// fill the shared memory
+    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
+      const size_t local_index = i + plane_kernel_offset;
+      const size_t tensor_index =
+          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
+
+      local_acc[local_index] =
+          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
+              ? device_evaluator.coeff(tensor_index)
+              : CoeffReturnType(0);
+    }
+
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+    // calculate the convolution // output start x
+    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
+    if (boundary_check(itemID.get_global_id() < input_range)) {
+      CoeffReturnType result = static_cast(0);
+      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
+      for (size_t k = 0; k < kernelSize; ++k) {
+        result += (local_acc[k + index] * kernel_ptr[k]);
+      }
+      const size_t tensor_index =
+          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
+          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
+      buffer_ptr[tensor_index] = result;
+    }
+  }
+};
+
+template
+struct EigenConvolutionKernel {
+  typedef cl::sycl::accessor
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper indexMapper;
+  const cl::sycl::range<2> kernel_size;
+  const cl::sycl::range<3> input_range;
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper indexMapper_,
+                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernel_size(kernel_size_),
+        input_range(input_range_) {}
+  template
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
+    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
+  }
+
+  void operator()(cl::sycl::nd_item<3> itemID) const {
+    auto buffer_ptr = buffer_acc;
+    auto kernel_ptr = kernel_filter;
+    // the rows required for each plane in shared memory
+    const auto num_input = cl::sycl::range<2>{
+        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
+
+    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
+    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
+
+    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
+                                                 itemID.get_group(1) * itemID.get_local_range()[1]};
+
+    // fill the local memory
+    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
+    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
+      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
+      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
+      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
+        const size_t local_index = i + local_input_offset;
+        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+                                                             i + input_offset[0], j + input_offset[1]);
+        local_acc[local_index] =
+            (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) && in_range_dim1 && in_range_dim2)
+                ?
device_evaluator.coeff(tensor_index) + : CoeffReturnType(0); + } + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // output offset start for each thread + const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], + itemID.get_group(1) * itemID.get_local_range()[1]}; + + if (boundary_check(itemID.get_global_id() < input_range)) { + CoeffReturnType result = static_cast(0); + + for (size_t j = 0; j < kernel_size[1]; j++) { + size_t kernel_offset = kernel_size[0] * j; + const size_t index = + (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0); + for (size_t i = 0; i < kernel_size[0]; i++) { + result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]); + } + } + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0], + itemID.get_local_id(1) + output_offset[1]); + + buffer_ptr[tensor_index] = result; + } + } +}; + +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const cl::sycl::range<3> kernel_size; + const cl::sycl::range<3> input_range; + const size_t numP; + + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_, + const size_t numP_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernel_size(kernel_size_), + input_range(input_range_), + numP(numP_) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const { + return (boolean_check[0] && boolean_check[1] && boolean_check[2]); + } + void operator()(cl::sycl::nd_item<3> itemID) const { + auto buffer_ptr = buffer_acc; + auto kernel_ptr = kernel_filter; + const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1}; + + const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()}; + + const auto output_offset = + cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()}; + + for (size_t p = 0; p < numP; p++) { + /// fill the shared memory + const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) { + size_t local_index_dim2 = num_input[0] * num_input[1] * k; + bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1)); + for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) { + bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1)); + size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2; + for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) { + bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1)); + const size_t local_index = local_index_dim1 + i; + const size_t tensor_index = + plane_input_offset + 
indexMapper.mapGpuInputKernelToTensorInputOffset( + i + input_offset[0], j + input_offset[1], k + input_offset[2]); + local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0); + } + } + } + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution + + if (boundary_check(itemID.get_global_id() < input_range)) { + CoeffReturnType result = static_cast(0); + for (size_t k = 0; k < kernel_size[2]; k++) { + for (size_t j = 0; j < kernel_size[1]; j++) { + for (size_t i = 0; i < kernel_size[0]; i++) { + const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k); + const size_t local_index = + ((i + itemID.get_local_id(0)) + + num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2)))); + + result += (local_acc[local_index] * kernel_ptr[kernel_index]); + } + } + } + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]); + buffer_ptr[tensor_index] = result; + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + } +}; + +template +struct TensorEvaluator, Eigen::SyclDevice> { + typedef TensorConvolutionOp XprType; + + static constexpr int NumDims = + internal::array_size::Dimensions>::value; + static constexpr int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; + typedef const Eigen::SyclDevice Device; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory KernelStorage; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device) + : m_inputImpl(op.inputExpression(), device), + m_kernelArg(op.kernelExpression()), + m_kernelImpl(op.kernelExpression(), device), + m_indices(op.indices()), + m_buf(NULL), + m_kernel(NULL), + m_local_kernel(false), + m_device(device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions &input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions &kernel_dims = + m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool 
evalSubExprsIfNeeded(EvaluatorPointerType data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (EvaluatorPointerType)m_device.get( + (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate_temp(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate_temp(m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; } + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + typename KernelStorage::Type in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz)); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(m_device.get(local), m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + m_kernel = local; + m_local_kernel = true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const { + typedef TensorEvaluator InputEvaluator; + typedef typename InputEvaluator::Dimensions InputDims; + switch (NumKernelDims) { + case 1: { + const size_t numX = dimensions()[m_indices[0]]; + const size_t numP = dimensions().TotalSize() / numX; + const auto input_dim = std::array{numX, numP}; + auto global_range = cl::sycl::range<2>{1, 1}; + auto local_range = cl::sycl::range<2>{1, 1}; + const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); + + m_device.parallel_for_setup(input_dim, global_range, local_range); + const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]); + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + const array indices{{m_indices[0]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[0]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + typedef EigenConvolutionKernel + ConvKernel; + + m_device + .template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])) + .wait(); + break; + } + + case 2: { + auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 1, + static_cast(Layout) == static_cast(ColMajor) ? 
1 : 0}; + auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], + (size_t)m_kernelImpl.dimensions()[kernel_index[1]]}; + const size_t numX = dimensions()[m_indices[kernel_index[0]]]; + const size_t numY = dimensions()[m_indices[kernel_index[1]]]; + const size_t numP = dimensions().TotalSize() / (numX * numY); + auto input_dim = std::array{numX, numY, numP}; + + auto global_range = cl::sycl::range<3>{1, 1, 1}; + auto local_range = cl::sycl::range<3>{1, 1, 1}; + + m_device.parallel_for_setup(input_dim, global_range, local_range); + + const size_t local_memory_size = + (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2]; + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + const array indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}}; + const array kernel_dims{ + {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + typedef EigenConvolutionKernel + ConvKernel; + m_device + .template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}) + .wait(); + break; + } + + case 3: { + auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 2, + static_cast(Layout) == static_cast(ColMajor) ? 1 : 1, + static_cast(Layout) == static_cast(ColMajor) ? 2 : 0}; + + auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], + (size_t)m_kernelImpl.dimensions()[kernel_index[1]], + (size_t)m_kernelImpl.dimensions()[kernel_index[2]]}; + + const size_t numX = dimensions()[m_indices[kernel_index[0]]]; + const size_t numY = dimensions()[m_indices[kernel_index[1]]]; + const size_t numZ = dimensions()[m_indices[kernel_index[2]]]; + auto input_dim = std::array{numX, numY, numZ}; + const size_t numP = dimensions().TotalSize() / (numX * numY * numZ); + + const array indices{ + {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]], + m_kernelImpl.dimensions()[kernel_index[1]], + m_kernelImpl.dimensions()[kernel_index[2]]}}; + + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + auto global_range = cl::sycl::range<3>{1, 1, 1}; + auto local_range = cl::sycl::range<3>{1, 1, 1}; + + m_device.parallel_for_setup(input_dim, global_range, local_range); + auto local_memory_range = (local_range + kernel_size - 1); + const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2]; + + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + typedef EigenConvolutionKernel + ConvKernel; + m_device + .template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP) + .wait(); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_buf != NULL); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + 
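+  // The kernel launches in executeEval() above tile the output across
+  // work-groups and stage the input window each tile needs in shared local
+  // memory: a work-group of extent L convolved with a kernel of extent K
+  // must load L + K - 1 samples per dimension (the tile plus its halo). As
+  // an illustrative check (not upstream code), for the 2D case with a
+  // 16x16 work-group and a 5x5 kernel:
+  //
+  //   (16 + 5 - 1) * (16 + 5 - 1) = 400 staged elements per plane,
+  //
+  // which is what the (local_range + kernel_size - 1) products feeding the
+  // sharedMemPerBlock() assertions compute.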
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
+    eigen_assert(m_buf != NULL);
+    eigen_assert(index < m_dimensions.TotalSize());
+    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
+    // model.
+    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+    // We ignore the use of fused multiply-add.
+    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+    const double firstIndex_compute_cost =
+        NumDims *
+        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
+    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
+                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
+  }
+
+ private:
+  // No assignment (copies are needed by the kernels)
+  TensorEvaluator& operator=(const TensorEvaluator&);
+  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
+  KernelArgType m_kernelArg;
+  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
+  Indices m_indices;
+  Dimensions m_dimensions;
+  EvaluatorPointerType m_buf;
+  typename KernelStorage::Type m_kernel;
+  bool m_local_kernel;
+  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
new file mode 100644
index 00000000000..c0f13337a60
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -0,0 +1,190 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class TensorOpCost
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief A cost model used to limit the number of threads used for evaluating
+ * tensor expressions.
+ *
+ */
+
+// Class storing the cost of evaluating a tensor expression in terms of the
+// estimated number of operand bytes loaded, bytes stored, and compute cycles.
+class TensorOpCost {
+ public:
+  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
+  // model based on minimal reciprocal throughput numbers from Intel or
+  // Agner Fog's tables would be better than what is there now.
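+
+  // As a worked sketch of how the helpers below compose (the numbers are
+  // illustrative, not a measured cost): evaluating 'a + b' over floats
+  // costs AddCost<float>() compute cycles per coefficient, 2 * sizeof(float)
+  // bytes loaded and sizeof(float) bytes stored, i.e.
+  //
+  //   TensorOpCost cost(2 * sizeof(float), sizeof(float),
+  //                     TensorOpCost::AddCost<float>());
+  //
+  // The (loads, stores, cycles, vectorized, packet_size) constructor below
+  // divides the cycle count by the packet size when vectorized, since one
+  // packet operation produces packet_size coefficients.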
+  template <typename ArgType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
+    return internal::functor_traits<internal::scalar_product_op<ArgType, ArgType> >::Cost;
+  }
+  template <typename ArgType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
+    return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
+  }
+  template <typename ArgType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
+    return internal::functor_traits<internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
+  }
+  template <typename ArgType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
+    return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
+  }
+  template <typename SrcType, typename TargetType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
+    return internal::functor_traits<internal::scalar_cast_op<SrcType, TargetType> >::Cost;
+  }
+
+  EIGEN_DEVICE_FUNC TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
+  EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
+      : bytes_loaded_(bytes_loaded), bytes_stored_(bytes_stored), compute_cycles_(compute_cycles) {}
+
+  EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized,
+                                 double packet_size)
+      : bytes_loaded_(bytes_loaded),
+        bytes_stored_(bytes_stored),
+        compute_cycles_(vectorized ? compute_cycles / packet_size : compute_cycles) {
+    eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
+    eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
+    eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { return bytes_loaded_; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { return bytes_stored_; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { return compute_cycles_; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(double load_cost, double store_cost,
+                                                          double compute_cost) const {
+    return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + compute_cost * compute_cycles_;
+  }
+
+  // Drop memory access component. Intended for cases when memory accesses are
+  // sequential or are completely masked by computations.
+  EIGEN_DEVICE_FUNC void dropMemoryCost() {
+    bytes_loaded_ = 0;
+    bytes_stored_ = 0;
+  }
+
+  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
+  }
+
+  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(const TensorOpCost& rhs) {
+    bytes_loaded_ += rhs.bytes_loaded();
+    bytes_stored_ += rhs.bytes_stored();
+    compute_cycles_ += rhs.compute_cycles();
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
+    bytes_loaded_ *= rhs;
+    bytes_stored_ *= rhs;
+    compute_cycles_ *= rhs;
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(TensorOpCost lhs, const TensorOpCost& rhs) {
+    lhs += rhs;
+    return lhs;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(TensorOpCost lhs, double rhs) {
+    lhs *= rhs;
+    return lhs;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(double lhs, TensorOpCost rhs) {
+    rhs *= lhs;
+    return rhs;
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
+    return os << "[bytes_loaded = " << tc.bytes_loaded() << ", bytes_stored = " << tc.bytes_stored()
+              << ", compute_cycles = " << tc.compute_cycles() << "]";
+  }
+
+ private:
+  double bytes_loaded_;
+  double bytes_stored_;
+  double compute_cycles_;
+};
+
+// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads
+// in [1:max_threads] instead of just switching multi-threading off for small
+// work units.
+template <typename Device>
+class TensorCostModel {
+ public:
+  // Scaling from Eigen compute cost to device cycles.
+  static const int kDeviceCyclesPerComputeCycle = 1;
+
+  // Costs in device cycles.
+  static const int kStartupCycles = 100000;
+  static const int kPerThreadCycles = 100000;
+  static const int kTaskSize = 40000;
+
+  // Returns the number of threads in [1:max_threads] to use for
+  // evaluating an expression with the given output size and cost per
+  // coefficient.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(double output_size, const TensorOpCost& cost_per_coeff,
+                                                              int max_threads) {
+    double cost = totalCost(output_size, cost_per_coeff);
+    double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+    // Make sure we don't invoke undefined behavior when we convert to an int.
+    threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
+    return numext::mini(max_threads, numext::maxi(1, static_cast<int>(threads)));
+  }
+
+  // taskSize assesses parallel task size.
+  // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
+  // granularity needs to be increased to mitigate parallelization overheads.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(double output_size,
+                                                               const TensorOpCost& cost_per_coeff) {
+    return totalCost(output_size, cost_per_coeff) / kTaskSize;
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(double output_size,
+                                                                const TensorOpCost& cost_per_coeff) {
+    // Cost of memory fetches from L2 cache. 64 is typical cache line size.
+    // 11 is L2 cache latency on Haswell.
+    // We don't know whether data is in L1, L2 or L3.
+    // But we are most interested
+    // in single-threaded computational time around 100us-10ms (smaller time
+    // is too small for parallelization, larger time is not interesting
+    // either because we are probably using all available threads already).
+    // And for the target time range, L2 seems to be what matters. Data set
+    // fitting into L1 is too small to take noticeable time. Data set fitting
+    // only into L3 presumably will take more than 10ms to load and process.
+    const double kLoadCycles = 1.0 / 64 * 11;
+    const double kStoreCycles = 1.0 / 64 * 11;
+    // Scaling from Eigen compute cost to device cycles.
+    return output_size * cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, kDeviceCyclesPerComputeCycle);
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
new file mode 100644
index 00000000000..c8e1df73394
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -0,0 +1,309 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class TensorCustomUnaryOp
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor custom class.
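+ *
+ * The functor supplied to a TensorCustomUnaryOp must expose the
+ * dimensions() and eval() members used by the evaluator below. A minimal
+ * sketch (MyUnaryFunc is a hypothetical name, not part of Eigen):
+ *
+ *   struct MyUnaryFunc {
+ *     template <typename Input>
+ *     auto dimensions(const Input& input) const { return input.dimensions(); }
+ *     template <typename Input, typename Output, typename Device>
+ *     void eval(const Input& input, Output& output, const Device& device) const {
+ *       output.device(device) = input.abs();  // any custom evaluation
+ *     }
+ *   };
+ *
+ * applied via input_tensor.customOp(MyUnaryFunc()).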
+ * + * + */ +namespace internal { +template +struct traits > { + typedef typename XprType::Scalar Scalar; + typedef typename XprType::StorageKind StorageKind; + typedef typename XprType::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = traits::Layout; + typedef typename traits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorCustomUnaryOp EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef TensorCustomUnaryOp type; +}; + +} // end namespace internal + +template +class TensorCustomUnaryOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func) + : m_expr(expr), m_func(func) {} + + EIGEN_DEVICE_FUNC const CustomUnaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_expr; } + + protected: + typename XprType::Nested m_expr; + const CustomUnaryFunc m_func; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorCustomUnaryOp ArgType; + typedef typename internal::traits::Index Index; + static constexpr int NumDims = internal::traits::NumDimensions; + typedef DSizes Dimensions; + typedef std::remove_const_t Scalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) { + m_dimensions = op.func().dimensions(op.expression()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast( + m_device.get((CoeffReturnType*)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)))); + evalTo(m_result); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + if (m_result) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_result[index]; } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + + protected: + void evalTo(EvaluatorPointerType data) { + TensorMap > result(m_device.get(data), m_dimensions); + m_op.func().eval(m_op.expression(), result, m_device); + } + + Dimensions m_dimensions; + const ArgType m_op; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_result; +}; + +/** \class TensorCustomBinaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor custom class. + * + * + */ +namespace internal { +template +struct traits > { + typedef typename internal::promote_storage_type::ret Scalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef + typename promote_index_type::Index, typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = traits::Layout; + typedef std::conditional_t::val, + typename traits::PointerType, typename traits::PointerType> + PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorCustomBinaryOp& type; +}; + +template +struct nested > { + typedef TensorCustomBinaryOp type; +}; + +} // end namespace internal + +template +class TensorCustomBinaryOp + : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::traits::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, + const CustomBinaryFunc& func) + + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {} + + EIGEN_DEVICE_FUNC const CustomBinaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& lhsExpression() const { + return m_lhs_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& rhsExpression() const { + return m_rhs_xpr; + } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const CustomBinaryFunc m_func; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorCustomBinaryOp XprType; + typedef typename internal::traits::Index Index; + static constexpr int NumDims = internal::traits::NumDimensions; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess 
= false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) { + m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast( + m_device.get((CoeffReturnType*)m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType)))); + evalTo(m_result); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + if (m_result != NULL) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_result[index]; } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + + protected: + void evalTo(EvaluatorPointerType data) { + TensorMap > result(m_device.get(data), m_dimensions); + m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); + } + + Dimensions m_dimensions; + const XprType m_op; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_result; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h new file mode 100644 index 00000000000..9898fdcfbb5 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -0,0 +1,139 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its argument + * on the specified computing 'device' (GPU, thread pool, ...) + * + * Example: + * C.device(EIGEN_GPU) = A + B; + * + * Todo: operator *= and /=. 
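+ *
+ * For instance, with EIGEN_USE_THREADS defined (a sketch; the pool and
+ * tensor sizes are arbitrary):
+ *
+ *   Eigen::ThreadPool pool(4);
+ *   Eigen::ThreadPoolDevice my_device(&pool, 4);
+ *   Eigen::Tensor<float, 2> A(100, 100), B(100, 100), C(100, 100);
+ *   A.setRandom(); B.setRandom();
+ *   C.device(my_device) = A + B;   // evaluated on the thread pool
+ *   C.device(my_device) += A;      // operator+= defined below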
+ */ + +template +class TensorDevice { + public: + TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorDevice) + + template + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + template + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + template + EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> + Difference; + Difference difference(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, difference); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; +}; + +/** \class TensorAsyncDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its + * argument asynchronously on the specified device. Currently only + * ThreadPoolDevice implements proper asynchronous execution, while the default + * and GPU devices just run the expression synchronously and call m_done() on + * completion.. + * + * Example: + * auto done = []() { ... expression evaluation done ... }; + * C.device(thread_pool_device, std::move(done)) = A + B; + */ + +template +class TensorAsyncDevice { + public: + TensorAsyncDevice(const DeviceType& device, ExpressionType& expression, DoneCallback done) + : m_device(device), m_expression(expression), m_done(std::move(done)) {} + + template + EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + typedef internal::TensorExecutor Executor; + + Assign assign(m_expression, other); + Executor::run(assign, m_device); + m_done(); + + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; + DoneCallback m_done; +}; + +#ifdef EIGEN_USE_THREADS +template +class TensorAsyncDevice { + public: + TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression, DoneCallback done) + : m_device(device), m_expression(expression), m_done(std::move(done)) {} + + template + EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + typedef internal::TensorAsyncExecutor Executor; + + // WARNING: After assignment 'm_done' callback will be in undefined state. 
+ Assign assign(m_expression, other); + Executor::runAsync(assign, m_device, std::move(m_done)); + + return *this; + } + + protected: + const ThreadPoolDevice& m_device; + ExpressionType& m_expression; + DoneCallback m_done; +}; +#endif + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h new file mode 100644 index 00000000000..c2c8ed002bf --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -0,0 +1,7 @@ + +#if defined(__clang__) || defined(__GNUC__) +#warning \ + "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file" +#endif + +#include "TensorDeviceGpu.h" diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h new file mode 100644 index 00000000000..eaaf3321500 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -0,0 +1,113 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +// Default device for the machine (typically a single cpu core) +struct DefaultDevice { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { internal::aligned_free(buffer); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { return allocate(num_bytes); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { deallocate(buffer); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const { +#ifdef EIGEN_GPU_COMPILE_PHASE + // std::fill is not a device function, so resort to simple loop. 
+ for (T* it = begin; it != end; ++it) { + *it = value; + } +#else + std::fill(begin, end, value); +#endif + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { +#if !defined(EIGEN_GPU_COMPILE_PHASE) + // Running on the host CPU + return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 64; +#else + // Running on a CUDA device + return 32; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) + // Running on the host CPU + return l1CacheSize(); +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 48 * 1024; // FIXME : update this number for HIP +#else + // Running on a CUDA device, return the amount of shared memory available. + return 48 * 1024; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) + // Running single threaded on the host CPU + return l3CacheSize(); +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return firstLevelCacheSize(); // FIXME : update this number for HIP +#else + // Running on a CUDA device + return firstLevelCacheSize(); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { + // Nothing. Default device operations are synchronous. + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { +#if !defined(EIGEN_GPU_COMPILE_PHASE) + // Running single threaded on the host CPU + // Should return an enum that encodes the ISA supported by the CPU + return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + // return 1 as major for HIP + return 1; +#else + // Running on a CUDA device + return EIGEN_CUDA_ARCH / 100; +#endif + } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h new file mode 100644 index 00000000000..4c24bc1f0a9 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h @@ -0,0 +1,395 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H + +// This header file container defines fo gpu* macros which will resolve to +// their equivalent hip* or cuda* versions depending on the compiler in use +// A separate header (included at the end of this file) will undefine all +#include "TensorGpuHipCudaDefines.h" + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +static const int kGpuScratchSize = 1024; + +// This defines an interface that GPUDevice can take to use +// HIP / CUDA streams underneath. 
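+// In typical use the GpuStreamDevice defined further down implements this
+// interface, e.g. (a sketch; requires EIGEN_USE_GPU and a CUDA/HIP
+// compiler):
+//
+//   Eigen::GpuStreamDevice stream;   // default stream on the current device
+//   Eigen::GpuDevice gpu_device(&stream);
+//   C.device(gpu_device) = A + B;    // expression evaluated on the stream
+//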
+class StreamInterface { + public: + virtual ~StreamInterface() {} + + virtual const gpuStream_t& stream() const = 0; + virtual const gpuDeviceProp_t& deviceProperties() const = 0; + + // Allocate memory on the actual device where the computation will run + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; + + // Return a scratchpad buffer of size 1k + virtual void* scratchpad() const = 0; + + // Return a semaphore. The semaphore is initially initialized to 0, and + // each kernel using it is responsible for resetting to 0 upon completion + // to maintain the invariant that the semaphore is always equal to 0 upon + // each kernel start. + virtual unsigned int* semaphore() const = 0; +}; + +class GpuDeviceProperties { + public: + GpuDeviceProperties() : initialized_(false), first_(true), device_properties_(nullptr) {} + + ~GpuDeviceProperties() { + if (device_properties_) { + delete[] device_properties_; + } + } + + EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { return device_properties_[device]; } + + EIGEN_STRONG_INLINE bool isInitialized() const { return initialized_; } + + void initialize() { + if (!initialized_) { + // Attempts to ensure proper behavior in the case of multiple threads + // calling this function simultaneously. This would be trivial to + // implement if we could use std::mutex, but unfortunately mutex don't + // compile with nvcc, so we resort to atomics and thread fences instead. + // Note that if the caller uses a compiler that doesn't support c++11 we + // can't ensure that the initialization is thread safe. + if (first_.exchange(false)) { + // We're the first thread to reach this point. + int num_devices; + gpuError_t status = gpuGetDeviceCount(&num_devices); + if (status != gpuSuccess) { + std::cerr << "Failed to get the number of GPU devices: " << gpuGetErrorString(status) << std::endl; + gpu_assert(status == gpuSuccess); + } + device_properties_ = new gpuDeviceProp_t[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = gpuGetDeviceProperties(&device_properties_[i], i); + if (status != gpuSuccess) { + std::cerr << "Failed to initialize GPU device #" << i << ": " << gpuGetErrorString(status) << std::endl; + gpu_assert(status == gpuSuccess); + } + } + + std::atomic_thread_fence(std::memory_order_release); + initialized_ = true; + } else { + // Wait for the other thread to inititialize the properties. 
+ while (!initialized_) { + std::atomic_thread_fence(std::memory_order_acquire); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + } + } + } + + private: + volatile bool initialized_; + std::atomic first_; + gpuDeviceProp_t* device_properties_; +}; + +EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() { + static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties(); + if (!deviceProperties->isInitialized()) { + deviceProperties->initialize(); + } + return *deviceProperties; +} + +EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) { + return GetGpuDeviceProperties().get(device); +} + +static const gpuStream_t default_stream = gpuStreamDefault; + +class GpuStreamDevice : public StreamInterface { + public: + // Use the default stream on the current device + GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { + gpuError_t status = gpuGetDevice(&device_); + if (status != gpuSuccess) { + std::cerr << "Failed to get the GPU devices " << gpuGetErrorString(status) << std::endl; + gpu_assert(status == gpuSuccess); + } + } + // Use the default stream on the specified device + GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {} + // Use the specified stream. Note that it's the + // caller responsibility to ensure that the stream can run on + // the specified device. If no device is specified the code + // assumes that the stream is associated to the current gpu device. + GpuStreamDevice(const gpuStream_t* stream, int device = -1) + : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { + if (device < 0) { + gpuError_t status = gpuGetDevice(&device_); + if (status != gpuSuccess) { + std::cerr << "Failed to get the GPU devices " << gpuGetErrorString(status) << std::endl; + gpu_assert(status == gpuSuccess); + } + } else { + int num_devices; + gpuError_t err = gpuGetDeviceCount(&num_devices); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + gpu_assert(device < num_devices); + device_ = device; + } + } + + virtual ~GpuStreamDevice() { + if (scratch_) { + deallocate(scratch_); + } + } + + const gpuStream_t& stream() const { return *stream_; } + const gpuDeviceProp_t& deviceProperties() const { return GetGpuDeviceProperties(device_); } + virtual void* allocate(size_t num_bytes) const { + gpuError_t err = gpuSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + void* result; + err = gpuMalloc(&result, num_bytes); + gpu_assert(err == gpuSuccess); + gpu_assert(result != NULL); + return result; + } + virtual void deallocate(void* buffer) const { + gpuError_t err = gpuSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + gpu_assert(buffer != NULL); + err = gpuFree(buffer); + gpu_assert(err == gpuSuccess); + } + + virtual void* scratchpad() const { + if (scratch_ == NULL) { + scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + virtual unsigned int* semaphore() const { + if (semaphore_ == NULL) { + char* scratch = static_cast(scratchpad()) + kGpuScratchSize; + semaphore_ = reinterpret_cast(scratch); + gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + return semaphore_; + } + + private: + const gpuStream_t* stream_; + int device_; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + +struct GpuDevice { + // The 
StreamInterface is not owned: the caller is + // responsible for its initialization and eventual destruction. + explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { eigen_assert(stream); } + explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { + eigen_assert(stream); + } + // TODO(bsteiner): This is an internal API, we should not expose it. + EIGEN_STRONG_INLINE const gpuStream_t& stream() const { return stream_->stream(); } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return stream_->allocate(num_bytes); } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { stream_->deallocate(buffer); } + + EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { return stream_->allocate(num_bytes); } + + EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { stream_->deallocate(buffer); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } + + EIGEN_STRONG_INLINE void* scratchpad() const { return stream_->scratchpad(); } + + EIGEN_STRONG_INLINE unsigned int* semaphore() const { return stream_->semaphore(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); +#else + EIGEN_UNUSED_VARIABLE(dst); + EIGEN_UNUSED_VARIABLE(src); + EIGEN_UNUSED_VARIABLE(n); + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); +#else + EIGEN_UNUSED_VARIABLE(buffer) + EIGEN_UNUSED_VARIABLE(c) + EIGEN_UNUSED_VARIABLE(n) + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + template + EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const { +#ifndef EIGEN_GPU_COMPILE_PHASE + const size_t count = end - begin; + // Split value into bytes and run memset with stride. + const int value_size = sizeof(value); + char* buffer = (char*)begin; + char* value_bytes = (char*)(&value); + gpuError_t err; + EIGEN_UNUSED_VARIABLE(err) + + // If all value bytes are equal, then a single memset can be much faster. 
+ bool use_single_memset = true; + for (int i = 1; i < value_size; ++i) { + if (value_bytes[i] != value_bytes[0]) { + use_single_memset = false; + } + } + + if (use_single_memset) { + err = gpuMemsetAsync(buffer, value_bytes[0], count * sizeof(T), stream_->stream()); + gpu_assert(err == gpuSuccess); + } else { + for (int b = 0; b < value_size; ++b) { + err = gpuMemset2DAsync(buffer + b, value_size, value_bytes[b], 1, count, stream_->stream()); + gpu_assert(err == gpuSuccess); + } + } +#else + EIGEN_UNUSED_VARIABLE(begin) + EIGEN_UNUSED_VARIABLE(end) + EIGEN_UNUSED_VARIABLE(value) + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + // FIXME + return 48 * 1024; + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on hip/cuda devices. + return firstLevelCacheSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuStreamSynchronize(stream_->stream()); + if (err != gpuSuccess) { + std::cerr << "Error detected in GPU stream: " << gpuGetErrorString(err) << std::endl; + gpu_assert(err == gpuSuccess); + } +#else + gpu_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const { return stream_->deviceProperties().multiProcessorCount; } + EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const { return stream_->deviceProperties().maxThreadsPerBlock; } + EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const { + return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + EIGEN_STRONG_INLINE int sharedMemPerBlock() const { + return static_cast(stream_->deviceProperties().sharedMemPerBlock); + } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { return stream_->deviceProperties().major; } + EIGEN_STRONG_INLINE int minorDeviceVersion() const { return stream_->deviceProperties().minor; } + + EIGEN_STRONG_INLINE int maxBlocks() const { return max_blocks_; } + + // This function checks if the GPU runtime recorded an error for the + // underlying stream device. + inline bool ok() const { +#ifdef EIGEN_GPUCC + gpuError_t error = gpuStreamQuery(stream_->stream()); + return (error == gpuSuccess) || (error == gpuErrorNotReady); +#else + return false; +#endif + } + + private: + const StreamInterface* stream_; + int max_blocks_; +}; + +#if defined(EIGEN_HIPCC) + +#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \ + gpu_assert(hipGetLastError() == hipSuccess); + +#else + +#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>(__VA_ARGS__); \ + gpu_assert(cudaGetLastError() == cudaSuccess); + +#endif + +// FIXME: Should be device and kernel specific. 
+#ifdef EIGEN_GPUCC +static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t status = gpuDeviceSetSharedMemConfig(config); + EIGEN_UNUSED_VARIABLE(status) + gpu_assert(status == gpuSuccess); +#else + EIGEN_UNUSED_VARIABLE(config) +#endif +} +#endif + +} // end namespace Eigen + +// undefine all the gpu* macros we defined at the beginning of the file +#include "TensorGpuHipCudaUndefines.h" + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h new file mode 100644 index 00000000000..b291fed66a5 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -0,0 +1,567 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Copyright (C) 2016 Benoit Steiner + +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H +#include + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +namespace TensorSycl { +namespace internal { + +/// Cache all the device information needed +struct SyclDeviceInfo { + SyclDeviceInfo(cl::sycl::queue queue) + : local_mem_type(queue.get_device().template get_info()), + max_work_item_sizes(queue.get_device().template get_info>()), + max_mem_alloc_size(queue.get_device().template get_info()), + max_compute_units(queue.get_device().template get_info()), + max_work_group_size(queue.get_device().template get_info()), + local_mem_size(queue.get_device().template get_info()), + platform_name(queue.get_device().get_platform().template get_info()), + device_name(queue.get_device().template get_info()), + device_vendor(queue.get_device().template get_info()) {} + + cl::sycl::info::local_mem_type local_mem_type; + cl::sycl::id<3> max_work_item_sizes; + unsigned long max_mem_alloc_size; + unsigned long max_compute_units; + unsigned long max_work_group_size; + size_t local_mem_size; + std::string platform_name; + std::string device_name; + std::string device_vendor; +}; + +} // end namespace internal +} // end namespace TensorSycl + +// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and +// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently +// TensorFlow via the Eigen SYCL Backend. 
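+// For instance (a sketch; requires EIGEN_USE_SYCL):
+//
+//   auto devices = get_sycl_supported_devices();
+//   Eigen::QueueInterface queue_interface(devices[0]);  // defined below
+//   Eigen::SyclDevice sycl_device(&queue_interface);
+//   C.device(sycl_device) = A + B;
+//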
+EIGEN_STRONG_INLINE auto get_sycl_supported_devices() -> decltype(cl::sycl::device::get_devices()) { +#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR + return {cl::sycl::device(cl::sycl::default_selector())}; +#else + std::vector supported_devices; + auto platform_list = cl::sycl::platform::get_platforms(); + for (const auto &platform : platform_list) { + auto device_list = platform.get_devices(); + auto platform_name = platform.template get_info(); + std::transform(platform_name.begin(), platform_name.end(), platform_name.begin(), ::tolower); + for (const auto &device : device_list) { + auto vendor = device.template get_info(); + std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); + bool unsupported_condition = (device.is_cpu() && platform_name.find("amd") != std::string::npos && + vendor.find("apu") == std::string::npos) || + (platform_name.find("experimental") != std::string::npos) || device.is_host(); + if (!unsupported_condition) { + supported_devices.push_back(device); + } + } + } + return supported_devices; +#endif +} + +class QueueInterface { + public: + /// Creating device by using cl::sycl::selector or cl::sycl::device. + template + explicit QueueInterface(const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler, + unsigned num_threads = std::thread::hardware_concurrency()) + : m_queue{dev_or_sel, handler, {sycl::property::queue::in_order()}}, + m_thread_pool(num_threads), + m_device_info(m_queue) {} + + template + explicit QueueInterface(const DeviceOrSelector &dev_or_sel, + unsigned num_threads = std::thread::hardware_concurrency()) + : QueueInterface( + dev_or_sel, [this](cl::sycl::exception_list l) { this->exception_caught_ = this->sycl_async_handler(l); }, + num_threads) {} + + explicit QueueInterface(const cl::sycl::queue &q, unsigned num_threads = std::thread::hardware_concurrency()) + : m_queue(q), m_thread_pool(num_threads), m_device_info(m_queue) {} + + EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + return (void *)cl::sycl::aligned_alloc_device(EIGEN_MAX_ALIGN_BYTES, num_bytes, m_queue); +#else + return (void *)cl::sycl::malloc_device(num_bytes, m_queue); +#endif + } + + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { + return (void *)cl::sycl::malloc_device(num_bytes, m_queue); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const { + return data; + } + + EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { deallocate(p); } + + EIGEN_STRONG_INLINE void deallocate_temp(const void *p) const { deallocate_temp(const_cast(p)); } + + EIGEN_STRONG_INLINE void deallocate(void *p) const { cl::sycl::free(p, m_queue); } + + /// The memcpyHostToDevice is used to copy the data from host to device + /// The destination pointer could be deleted before the copy happened which is + /// why a callback function is needed. By default if none is provided, the + /// function is blocking. + EIGEN_STRONG_INLINE void memcpyHostToDevice(void *dst, const void *src, size_t n, + std::function callback) const { + auto e = m_queue.memcpy(dst, src, n); + synchronize_and_callback(e, callback); + } + + /// The memcpyDeviceToHost is used to copy the data from device to host. + /// The source pointer could be deleted before the copy happened which is + /// why a callback function is needed. By default if none is provided, the + /// function is blocking. 
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const void *src, size_t n,
+                                              std::function<void()> callback) const {
+    if (n == 0) {
+      if (callback) callback();
+      return;
+    }
+    auto e = m_queue.memcpy(dst, src, n);
+    synchronize_and_callback(e, callback);
+  }
+
+  /// The memcpy function.
+  /// No callback is required here as both arguments are on the device
+  /// and SYCL can handle the dependency.
+  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
+    if (n == 0) {
+      return;
+    }
+    m_queue.memcpy(dst, src, n).wait();
+  }
+
+  /// the memset function.
+  /// No callback is required here as both arguments are on the device
+  /// and SYCL can handle the dependency.
+  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
+    if (n == 0) {
+      return;
+    }
+    m_queue.memset(data, c, n).wait();
+  }
+
+  template <typename T>
+  EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
+    if (begin == end) {
+      return;
+    }
+    const size_t count = end - begin;
+    m_queue.fill(begin, value, count).wait();
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename Lhs, typename Rhs, typename OutPtr, typename Range,
+            typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(const Lhs &lhs, const Rhs &rhs, OutPtr outptr,
+                                                             Range thread_range, Index scratchSize, T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
+    };
+
+    return m_queue.submit(kernel_functor);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename InPtr, typename OutPtr, typename Range, typename Index,
+            typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(const InPtr &inptr, OutPtr &outptr, Range thread_range,
+                                                            Index scratchSize, T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, outptr, var...));
+    };
+    return m_queue.submit(kernel_functor);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename InPtr, typename Range, typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(const InPtr &inptr, Range thread_range,
+                                                              Index scratchSize, T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, var...));
+    };
+
+    return m_queue.submit(kernel_functor);
+  }
+
+  EIGEN_STRONG_INLINE void synchronize() const {
+#ifdef EIGEN_EXCEPTIONS
+    m_queue.wait_and_throw();
+#else
+    m_queue.wait();
+#endif
+  }
+
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
+    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                        static_cast<Index>(tileSize));
+    rng = n;
+    if (rng == 0) rng = static_cast<Index>(1);
+    GRange = rng;
+    if (tileSize > GRange)
+      tileSize = GRange;
+    else if (GRange > tileSize) {
+      Index xMode = static_cast<Index>(GRange % tileSize);
+      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
+    }
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+                                              cl::sycl::range<2> &local_range) const {
+    std::array<Index, 2> input_range = input_dim;
+    Index max_workgroup_Size = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    max_workgroup_Size = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                                  static_cast<Index>(max_workgroup_Size));
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    local_range[1] = static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0) global_range[1] += static_cast<Index>(local_range[1] - xMode);
+    }
+    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0) global_range[0] += static_cast<Index>(local_range[0] - xMode);
+    }
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+                                              cl::sycl::range<3> &local_range) const {
+    std::array<Index, 3> input_range = input_dim;
+    Index max_workgroup_Size = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    max_workgroup_Size = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                                  static_cast<Index>(max_workgroup_Size));
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    local_range[2] = static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
+    input_range[2] = input_dim[2];
+    if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
+    global_range[2] = input_range[2];
+    if (local_range[2] > global_range[2])
+      local_range[2] = global_range[2];
+    else if (global_range[2] > local_range[2]) {
+      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
+      if (xMode != 0) global_range[2] += static_cast<Index>(local_range[2] - xMode);
+    }
+    pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
+    local_range[1] = static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0) global_range[1] += static_cast<Index>(local_range[1] - xMode);
+    }
+    local_range[0] = static_cast<Index>(max_workgroup_Size / (local_range[1] * local_range[2]));
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0) global_range[0] += static_cast<Index>(local_range[0] - xMode);
+    }
+  }
+
+  EIGEN_STRONG_INLINE bool has_local_memory() const {
+#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+    return false;
+#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
+    return true;
+#else
+    return m_device_info.local_mem_type == cl::sycl::info::local_mem_type::local;
+#endif
+  }
+
+  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const { return m_device_info.max_mem_alloc_size; }
+
+  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { return m_device_info.max_compute_units; }
+
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { return m_device_info.max_work_group_size; }
+
+  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { return m_device_info.max_work_item_sizes; }
+
+  /// No need for sycl it should act the same as CPU version
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
+
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+    // OpenCL does not have such a concept
+    return 2;
+  }
+
+  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { return m_device_info.local_mem_size; }
+
+  // This function returns the nearest power-of-two work-group size which is
+  // <= the maximum device work-group size.
+  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
+    return getPowerOfTwo(m_device_info.max_work_group_size, false);
+  }
+
+  EIGEN_STRONG_INLINE std::string getPlatformName() const { return m_device_info.platform_name; }
+
+  EIGEN_STRONG_INLINE std::string getDeviceName() const { return m_device_info.device_name; }
+
+  EIGEN_STRONG_INLINE std::string getDeviceVendor() const { return m_device_info.device_vendor; }
+
+  // This function returns the nearest power of 2:
+  // if roundUp is true, the result is >= wGSize;
+  // otherwise the result is <= wGSize.
+  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
+    if (roundUp) --wGSize;
+    wGSize |= (wGSize >> 1);
+    wGSize |= (wGSize >> 2);
+    wGSize |= (wGSize >> 4);
+    wGSize |= (wGSize >> 8);
+    wGSize |= (wGSize >> 16);
+#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
+    wGSize |= (wGSize >> 32);
+#endif
+    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
+  }
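+  // Editor's illustration (not upstream code): the OR-shift cascade above
+  // smears the highest set bit into every lower bit. For wGSize = 48
+  // (0b110000) it produces 63 (0b111111); with roundUp == false,
+  // 63 - (63 >> 1) == 32 is the largest power of two <= 48, while with
+  // roundUp == true the initial --wGSize and final ++wGSize turn 47 into 64,
+  // the smallest power of two >= 48.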
+  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
+
+  // This function checks if the runtime recorded an error for the
+  // underlying stream device.
+  EIGEN_STRONG_INLINE bool ok() const {
+    if (!exception_caught_) {
+      synchronize();
+    }
+    return !exception_caught_;
+  }
+
+ protected:
+  void synchronize_and_callback(cl::sycl::event e, const std::function<void()> &callback) const {
+    if (callback) {
+      auto callback_ = [=]() {
+#ifdef EIGEN_EXCEPTIONS
+        cl::sycl::event(e).wait_and_throw();
+#else
+        cl::sycl::event(e).wait();
+#endif
+        callback();
+      };
+      m_thread_pool.Schedule(std::move(callback_));
+    } else {
+#ifdef EIGEN_EXCEPTIONS
+      m_queue.wait_and_throw();
+#else
+      m_queue.wait();
+#endif
+    }
+  }
+
+  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
+    bool exception_caught = false;
+    for (const auto &e : exceptions) {
+      if (e) {
+        exception_caught = true;
+        EIGEN_THROW_X(e);
+      }
+    }
+    return exception_caught;
+  }
+
+  /// class members:
+  bool exception_caught_ = false;
+  /// sycl queue
+  mutable cl::sycl::queue m_queue;
+  /// The thread pool is used to wait on events and call callbacks
+  /// asynchronously
+  mutable Eigen::ThreadPool m_thread_pool;
+
+  const TensorSycl::internal::SyclDeviceInfo m_device_info;
+};
+
+struct SyclDeviceBase {
+  /// QueueInterface is not owned. It is the caller's responsibility to
+  /// destroy it.
+  const QueueInterface *m_queue_stream;
+  explicit SyclDeviceBase(const QueueInterface *queue_stream) : m_queue_stream(queue_stream) {}
+  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const { return m_queue_stream; }
+};
+
+// Here is a sycl device struct which accepts the sycl queue interface
+// as an input
+struct SyclDevice : public SyclDeviceBase {
+  explicit SyclDevice(const QueueInterface *queue_stream) : SyclDeviceBase(queue_stream) {}
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
+    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+                                              cl::sycl::range<2> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+                                              cl::sycl::range<3> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
+  }
+
+  /// allocate device memory
+  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { return queue_stream()->allocate(num_bytes); }
+
+  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { return queue_stream()->allocate_temp(num_bytes); }
+
+  /// deallocate device memory
+  EIGEN_STRONG_INLINE void deallocate(void *p) const { queue_stream()->deallocate(p); }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const { queue_stream()->deallocate_temp(buffer); }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(const void *buffer) const { queue_stream()->deallocate_temp(buffer); }
+
+  template <typename data_t>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
+    return data;
+  }
+
+  // some runtime conditions that can be applied here
+  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
+
+  /// memcpyHostToDevice
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n,
+                                              std::function<void()> callback = {}) const {
+    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
+  }
+  /// memcpyDeviceToHost
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n,
+                                              std::function<void()> callback = {}) const {
+    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
+  }
+  /// the memcpy function
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
+    queue_stream()->memcpy(dst, src, n);
+  }
+  /// the memset function
+  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { queue_stream()->memset(data, c, n); }
+  /// the fill function
+  template <typename T>
+  EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
+    queue_stream()->fill(begin, end, value);
+  }
+  /// returning the sycl queue
+  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return queue_stream()->sycl_queue(); }
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the l2 cache for the time being, and
+    // there is no l3 cache on sycl devices.
+    return firstLevelCacheSize();
+  }
+  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+    return queue_stream()->getNumSyclMultiProcessors();
+  }
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { return queue_stream()->maxSyclThreadsPerBlock(); }
+  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { return queue_stream()->maxWorkItemSizes(); }
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+    // OpenCL does not have such a concept
+    return queue_stream()->maxSyclThreadsPerMultiProcessor();
+  }
+  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { return queue_stream()->sharedMemPerBlock(); }
+  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
+    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
+  }
+
+  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
+    return queue_stream()->getPowerOfTwo(val, roundUp);
+  }
+  /// No need for sycl it should act the same as CPU version
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return queue_stream()->majorDeviceVersion(); }
+
+  EIGEN_STRONG_INLINE void synchronize() const { queue_stream()->synchronize(); }
+
+  // This function checks if the runtime recorded an error for the
+  // underlying stream device.
+  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
+
+  EIGEN_STRONG_INLINE bool has_local_memory() const { return queue_stream()->has_local_memory(); }
+  EIGEN_STRONG_INLINE long max_buffer_size() const { return queue_stream()->max_buffer_size(); }
+  EIGEN_STRONG_INLINE std::string getPlatformName() const { return queue_stream()->getPlatformName(); }
+  EIGEN_STRONG_INLINE std::string getDeviceName() const { return queue_stream()->getDeviceName(); }
+  EIGEN_STRONG_INLINE std::string getDeviceVendor() const { return queue_stream()->getDeviceVendor(); }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(T... var) const {
+    return queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(var...);
+  }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(T... var) const {
+    return queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(var...);
+  }
+
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(T... var) const {
+    return queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(var...);
+  }
+};
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
new file mode 100644
index 00000000000..c95c8f22343
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -0,0 +1,376 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// Runs an arbitrary function and then calls Notify() on the passed in
+// Notification.
+template <typename Function, typename... Args>
+struct FunctionWrapperWithNotification {
+  static void run(Notification* n, Function f, Args... args) {
+    f(args...);
+    if (n) {
+      n->Notify();
+    }
+  }
+};
+
+template <typename Function, typename... Args>
+struct FunctionWrapperWithBarrier {
+  static void run(Barrier* b, Function f, Args... args) {
+    f(args...);
+    if (b) {
+      b->Notify();
+    }
+  }
+};
+
+template <typename SyncType>
+static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
+  if (n) {
+    n->Wait();
+  }
+}
+
+// An abstract interface to a device specific memory allocator.
+class Allocator {
+ public:
+  virtual ~Allocator() {}
+  virtual void* allocate(size_t num_bytes) const = 0;
+  virtual void deallocate(void* buffer) const = 0;
+};
+
+// Build a thread pool device on top of an existing pool of threads.
+struct ThreadPoolDevice {
+  // The ownership of the thread pool remains with the caller.
+  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr)
+      : pool_(pool), num_threads_(num_cores), allocator_(allocator) {}
+
+  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+    return allocator_ ? allocator_->allocate(num_bytes) : internal::aligned_malloc(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+    if (allocator_) {
+      allocator_->deallocate(buffer);
+    } else {
+      internal::aligned_free(buffer);
+    }
+  }
+
+  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { return allocate(num_bytes); }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { deallocate(buffer); }
+
+  template <typename Type>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
+    return data;
+  }
+
+  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifdef __ANDROID__
+    ::memcpy(dst, src, n);
+#else
+    // TODO(rmlarsen): Align blocks on cache lines.
+    // We have observed that going beyond 4 threads usually just wastes
+    // CPU cycles due to the threads competing for memory bandwidth, so we
+    // statically schedule at most 4 block copies here.
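+    // Worked example (editor's note, not upstream code): assuming the cost
+    // model picks num_threads = 4 for n = 100001 bytes, blocksize =
+    // (100001 + 3) / 4 = 25001; worker i in {1, 2, 3} copies
+    // numext::mini(25001, 100001 - i * 25001) bytes starting at offset
+    // i * 25001, and the main thread copies the first 25001 bytes itself.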
+    const size_t kMinBlockSize = 32768;
+    const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4);
+    if (n <= kMinBlockSize || num_threads < 2) {
+      ::memcpy(dst, src, n);
+    } else {
+      const char* src_ptr = static_cast<const char*>(src);
+      char* dst_ptr = static_cast<char*>(dst);
+      const size_t blocksize = (n + (num_threads - 1)) / num_threads;
+      Barrier barrier(static_cast<unsigned int>(num_threads - 1));
+      // Launch the last 3 blocks on worker threads.
+      for (size_t i = 1; i < num_threads; ++i) {
+        enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] {
+          ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize, numext::mini(blocksize, n - (i * blocksize)));
+        });
+      }
+      // Launch the first block on the main thread.
+      ::memcpy(dst_ptr, src_ptr, blocksize);
+      barrier.Wait();
+    }
+#endif
+  }
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); }
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); }
+
+  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); }
+
+  template <typename T>
+  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+    std::fill(begin, end, value);
+  }
+
+  EIGEN_STRONG_INLINE int numThreads() const { return num_threads_; }
+
+  // Number of threads available in the underlying thread pool. This number can
+  // be different from the value returned by numThreads().
+  EIGEN_STRONG_INLINE int numThreadsInPool() const { return pool_->NumThreads(); }
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return l1CacheSize(); }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // The l3 cache size is shared between all the cores.
+    return l3CacheSize() / num_threads_;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+    // Nothing. Threadpool device operations are synchronous.
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+    // Should return an enum that encodes the ISA supported by the CPU
+    return 1;
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
+    Notification* n = new Notification();
+    pool_->Schedule(
+        std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, std::forward<Function>(f), args...));
+    return n;
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f, Args&&... args) const {
+    pool_->Schedule(
+        std::bind(&FunctionWrapperWithBarrier<Function, Args...>::run, b, std::forward<Function>(f), args...));
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
+    if (sizeof...(args) > 0) {
+      pool_->Schedule(std::bind(std::forward<Function>(f), args...));
+    } else {
+      pool_->Schedule(std::forward<Function>(f));
+    }
+  }
+
+  // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
+  // called from one of the threads in pool_. Returns -1 otherwise.
+  EIGEN_STRONG_INLINE int currentThreadId() const { return pool_->CurrentThreadId(); }
+
+  // WARNING: This function is synchronous and will block the calling thread.
+  //
+  // Synchronous parallelFor executes f with [0, n) arguments in parallel and
+  // waits for completion. F accepts a half-open interval [first, last). Block
+  // size is chosen based on the iteration cost and resulting parallel
+  // efficiency. If block_align is not nullptr, it is called to round up the
+  // block size.
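+  //
+  // A minimal usage sketch (editor's illustration, not part of the upstream
+  // header; `pool4`, `in`, `out` and `n` are hypothetical caller-side names):
+  //
+  //   Eigen::ThreadPool pool4(4);
+  //   Eigen::ThreadPoolDevice device(&pool4, pool4.NumThreads());
+  //   device.parallelFor(n, Eigen::TensorOpCost(sizeof(float), sizeof(float), 2),
+  //                      [&](Eigen::Index first, Eigen::Index last) {
+  //                        for (Eigen::Index i = first; i < last; ++i) out[i] = 2.f * in[i];
+  //                      });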
+  void parallelFor(Index n, const TensorOpCost& cost, std::function<Index(Index)> block_align,
+                   std::function<void(Index, Index)> f) const {
+    if (EIGEN_PREDICT_FALSE(n <= 0)) {
+      return;
+      // Compute small problems directly in the caller thread.
+    } else if (n == 1 || numThreads() == 1 || CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+      f(0, n);
+      return;
+    }
+
+    // Compute block size and total count of blocks.
+    ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
+
+    // Recursively divide size into halves until we reach block_size.
+    // Division code rounds mid to block_size, so we are guaranteed to get
+    // block_count leaves that do actual computations.
+    Barrier barrier(static_cast<unsigned int>(block.count));
+    std::function<void(Index, Index)> handleRange;
+    handleRange = [this, block, &handleRange, &barrier, &f](Index firstIdx, Index lastIdx) {
+      while (lastIdx - firstIdx > block.size) {
+        // Split into halves and schedule the second half on a different thread.
+        const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size;
+        pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); });
+        lastIdx = midIdx;
+      }
+      // Single block or less, execute directly.
+      f(firstIdx, lastIdx);
+      barrier.Notify();
+    };
+
+    if (block.count <= numThreads()) {
+      // Avoid a thread hop by running the root of the tree and one block on the
+      // main thread.
+      handleRange(0, n);
+    } else {
+      // Execute the root in the thread pool to avoid running work on more than
+      // numThreads() threads.
+      pool_->Schedule([=, &handleRange]() { handleRange(0, n); });
+    }
+
+    barrier.Wait();
+  }
+
+  // Convenience wrapper for parallelFor that does not align blocks.
+  void parallelFor(Index n, const TensorOpCost& cost, std::function<void(Index, Index)> f) const {
+    parallelFor(n, cost, nullptr, std::move(f));
+  }
+
+  // WARNING: This function is asynchronous and will not block the calling thread.
+  //
+  // Asynchronous parallelFor executes f with [0, n) arguments in parallel
+  // without waiting for completion. When the last block finishes, it will call
+  // the 'done' callback. F accepts a half-open interval [first, last). Block
+  // size is chosen based on the iteration cost and resulting parallel
+  // efficiency. If block_align is not nullptr, it is called to round up the
+  // block size.
+  void parallelForAsync(Index n, const TensorOpCost& cost, std::function<Index(Index)> block_align,
+                        std::function<void(Index, Index)> f, std::function<void()> done) const {
+    // Compute small problems directly in the caller thread.
+    if (n <= 1 || numThreads() == 1 || CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+      f(0, n);
+      done();
+      return;
+    }
+
+    // Compute block size and total count of blocks.
+    ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
+
+    ParallelForAsyncContext* const ctx = new ParallelForAsyncContext(block.count, std::move(f), std::move(done));
+
+    // Recursively divide size into halves until we reach block_size.
+    // Division code rounds mid to block_size, so we are guaranteed to get
+    // block_count leaves that do actual computations.
+    ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) {
+      while (lastIdx - firstIdx > block.size) {
+        // Split into halves and schedule the second half on a different thread.
+        const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size;
+        pool_->Schedule([ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); });
+        lastIdx = midIdx;
+      }
+
+      // Single block or less, execute directly.
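+      // Editor's sketch of one split (not upstream code): with firstIdx = 0,
+      // lastIdx = 1000 and block.size = 128, midIdx = 0 +
+      // div_ceil(500, 128) * 128 = 512, so [512, 1000) is rescheduled and the
+      // loop continues with [0, 512); every leaf interval thus starts at a
+      // multiple of block.size.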
+      ctx->f(firstIdx, lastIdx);
+
+      // Delete async context if it was the last block.
+      if (ctx->count.fetch_sub(1) == 1) delete ctx;
+    };
+
+    if (block.count <= numThreads()) {
+      // Avoid a thread hop by running the root of the tree and one block on the
+      // main thread.
+      ctx->handle_range(0, n);
+    } else {
+      // Execute the root in the thread pool to avoid running work on more than
+      // numThreads() threads.
+      pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); });
+    }
+  }
+
+  // Convenience wrapper for parallelForAsync that does not align blocks.
+  void parallelForAsync(Index n, const TensorOpCost& cost, std::function<void(Index, Index)> f,
+                        std::function<void()> done) const {
+    parallelForAsync(n, cost, nullptr, std::move(f), std::move(done));
+  }
+
+  // Thread pool accessor.
+  ThreadPoolInterface* getPool() const { return pool_; }
+
+  // Allocator accessor.
+  Allocator* allocator() const { return allocator_; }
+
+ private:
+  typedef TensorCostModel<ThreadPoolDevice> CostModel;
+
+  // For parallelForAsync we must keep the passed in closures on the heap, and
+  // delete them only after the `done` callback has finished.
+  struct ParallelForAsyncContext {
+    ParallelForAsyncContext(Index block_count, std::function<void(Index, Index)> block_f,
+                            std::function<void()> done_callback)
+        : count(block_count), f(std::move(block_f)), done(std::move(done_callback)) {}
+    ~ParallelForAsyncContext() { done(); }
+
+    std::atomic<Index> count;
+    std::function<void(Index, Index)> f;
+    std::function<void()> done;
+
+    std::function<void(Index, Index)> handle_range;
+  };
+
+  struct ParallelForBlock {
+    Index size;   // block size
+    Index count;  // number of blocks
+  };
+
+  // Calculates block size based on (1) the iteration cost and (2) parallel
+  // efficiency. We want blocks to be not too small to mitigate parallelization
+  // overheads; not too large to mitigate tail effect and potential load
+  // imbalance, and we also want the number of blocks to be evenly divisible
+  // across threads.
+  ParallelForBlock CalculateParallelForBlock(const Index n, const TensorOpCost& cost,
+                                             std::function<Index(Index)> block_align) const {
+    const double block_size_f = 1.0 / CostModel::taskSize(1, cost);
+    const Index max_oversharding_factor = 4;
+    Index block_size = numext::mini(
+        n, numext::maxi<Index>(numext::div_ceil<Index>(n, max_oversharding_factor * numThreads()), block_size_f));
+    const Index max_block_size = numext::mini(n, 2 * block_size);
+
+    if (block_align) {
+      Index new_block_size = block_align(block_size);
+      eigen_assert(new_block_size >= block_size);
+      block_size = numext::mini(n, new_block_size);
+    }
+
+    Index block_count = numext::div_ceil(n, block_size);
+
+    // Calculate parallel efficiency as fraction of total CPU time used for
+    // computations:
+    double max_efficiency =
+        static_cast<double>(block_count) / (numext::div_ceil<Index>(block_count, numThreads()) * numThreads());
+
+    // Now try to increase block size up to max_block_size as long as it
+    // doesn't decrease parallel efficiency.
+    for (Index prev_block_count = block_count; max_efficiency < 1.0 && prev_block_count > 1;) {
+      // This is the next block size that divides size into a smaller number
+      // of blocks than the current block_size.
+      Index coarser_block_size = numext::div_ceil(n, prev_block_count - 1);
+      if (block_align) {
+        Index new_block_size = block_align(coarser_block_size);
+        eigen_assert(new_block_size >= coarser_block_size);
+        coarser_block_size = numext::mini(n, new_block_size);
+      }
+      if (coarser_block_size > max_block_size) {
+        break;  // Reached max block size. Stop.
+      }
+      // Recalculate parallel efficiency.
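+      // Worked example (editor's note, not upstream code): with numThreads()
+      // = 4, 10 blocks give efficiency 10 / (div_ceil(10, 4) * 4) = 10 / 12,
+      // i.e. ~0.83, while coarsening to 8 blocks gives 8 / (2 * 4) = 1.0, so
+      // the coarser block size would be accepted below.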
+      const Index coarser_block_count = numext::div_ceil(n, coarser_block_size);
+      eigen_assert(coarser_block_count < prev_block_count);
+      prev_block_count = coarser_block_count;
+      const double coarser_efficiency = static_cast<double>(coarser_block_count) /
+                                        (numext::div_ceil<Index>(coarser_block_count, numThreads()) * numThreads());
+      if (coarser_efficiency + 0.01 >= max_efficiency) {
+        // Taking it.
+        block_size = coarser_block_size;
+        block_count = coarser_block_count;
+        if (max_efficiency < coarser_efficiency) {
+          max_efficiency = coarser_efficiency;
+        }
+      }
+    }
+
+    return {block_size, block_count};
+  }
+
+  ThreadPoolInterface* pool_;
+  int num_threads_;
+  Allocator* allocator_;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
new file mode 100644
index 00000000000..315b556aefc
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
@@ -0,0 +1,119 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \internal
+ *
+ * \class TensorDimensionList
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n.
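+ *
+ * For instance (editor's illustration), DimensionList<DenseIndex, 3> reads
+ * back as the index list {0, 1, 2}: operator[] simply returns its argument,
+ * which is exactly the list of dimension indices a "reduce over every
+ * dimension" operation needs.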
+ * + * \sa Tensor + */ + +template +struct DimensionList { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const Index operator[](const Index i) const { return i; } +}; + +namespace internal { + +template +struct array_size > { + static const size_t value = Rank; +}; +template +struct array_size > { + static const size_t value = Rank; +}; + +template +const Index array_get(DimensionList&) { + return n; +} +template +const Index array_get(const DimensionList&) { + return n; +} + +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { return true; } +}; +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { return true; } +}; + +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { return true; } +}; +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { return true; } +}; + +template +struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { return true; } +}; +template +struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { return true; } +}; + +template +struct index_statically_eq_impl > { + static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i == value; } +}; +template +struct index_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i == value; } +}; + +template +struct index_statically_ne_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i != value; } +}; +template +struct index_statically_ne_impl > { + static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i != value; } +}; + +template +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i > value; } +}; +template +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i > value; } +}; + +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i < value; } +}; +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i < value; } +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h new file mode 100644 index 00000000000..06d4dfd30dd --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -0,0 +1,329 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \internal + * + * \class TensorDimensions + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode and store the dimensions of a Tensor. + * + * The Sizes class encodes as part of the type the number of dimensions and the + * sizes corresponding to each dimension. It uses no storage space since it is + * entirely known at compile time. + * The DSizes class is its dynamic sibling: the number of dimensions is known + * at compile time but the sizes are set during execution. + * + * \sa Tensor + */ + +// Boilerplate code +namespace internal { + +template +struct dget { + static const std::ptrdiff_t value = get::value; +}; + +template +struct fixed_size_tensor_index_linearization_helper { + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index run(array const& indices, + const Dimensions& dimensions) { + return array_get < RowMajor ? n - 1 + : (NumIndices - n) > (indices) + dget < RowMajor ? n - 1 + : (NumIndices - n), + Dimensions > ::value * fixed_size_tensor_index_linearization_helper::run( + indices, dimensions); + } +}; + +template +struct fixed_size_tensor_index_linearization_helper { + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index run(array const&, const Dimensions&) { + return 0; + } +}; + +template +struct fixed_size_tensor_index_extraction_helper { + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index run(const Index index, const Dimensions& dimensions) { + const Index mult = (index == n - 1) ? 1 : 0; + return array_get(dimensions) * mult + + fixed_size_tensor_index_extraction_helper::run(index, dimensions); + } +}; + +template +struct fixed_size_tensor_index_extraction_helper { + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index run(const Index, const Dimensions&) { + return 0; + } +}; + +} // end namespace internal + +// Fixed size +template +struct Sizes { + typedef internal::numeric_list Base; + const Base t = Base(); + static const std::ptrdiff_t total_size = internal::arg_prod(Indices...); + static const ptrdiff_t count = Base::count; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const { return Base::count; } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() { return internal::arg_prod(Indices...); } + + EIGEN_DEVICE_FUNC Sizes() {} + template + explicit EIGEN_DEVICE_FUNC Sizes(const array& /*indices*/) { + // todo: add assertion + } + template + EIGEN_DEVICE_FUNC Sizes(DenseIndex...) 
{} + explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list /*l*/) { + // todo: add assertion + } + + template + Sizes& operator=(const T& /*other*/) { + // add assertion failure if the size of other is different + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[](const std::ptrdiff_t index) const { + return internal::fixed_size_tensor_index_extraction_helper::run(index, t); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t IndexOfColMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run( + indices, t); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t IndexOfRowMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run( + indices, t); + } +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes&) { + return Sizes::total_size; +} +} // namespace internal + +// Boilerplate +namespace internal { +template +struct tensor_index_linearization_helper { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index run(array const& indices, + array const& dimensions) { + return array_get < RowMajor ? n + : (NumIndices - n - 1) > (indices) + array_get < RowMajor + ? n + : (NumIndices - n - 1) > + (dimensions)*tensor_index_linearization_helper::run( + indices, dimensions); + } +}; + +template +struct tensor_index_linearization_helper { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index run(array const& indices, + array const&) { + return array_get < RowMajor ? 0 : NumIndices - 1 > (indices); + } +}; +} // end namespace internal + +// Dynamic size +template +struct DSizes : array { + typedef array Base; + static const int count = NumDims; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumDims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const { + return (NumDims == 0) ? 1 : internal::array_prod(*static_cast(this)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() { + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = 0; + } + } + EIGEN_DEVICE_FUNC explicit DSizes(const array& a) : Base(a) {} + + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { + eigen_assert(NumDims == 1); + (*this)[0] = i0; + } + + EIGEN_DEVICE_FUNC DSizes(const DimensionList& a) { + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + + // Enable DSizes index type promotion only if we are promoting to the + // larger type, e.g. allow to promote dimensions of type int to long. + template + EIGEN_DEVICE_FUNC explicit DSizes( + const array& other, + // Default template parameters require c++11. + std::enable_if_t< + internal::is_same::type>::value, + void*> = 0) { + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = static_cast(other[i]); + } + } + + template + EIGEN_DEVICE_FUNC explicit DSizes(const Eigen::IndexList& dimensions) { + for (int i = 0; i < dimensions.count; ++i) { + (*this)[i] = dimensions[i]; + } + } + + template + EIGEN_DEVICE_FUNC DSizes(const Sizes& a) { + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, + IndexTypes... 
otherDimensions) + : Base({{firstDimension, secondDimension, otherDimensions...}}) { + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + + EIGEN_DEVICE_FUNC DSizes& operator=(const array& other) { + *static_cast(this) = other; + return *this; + } + + // A constexpr would be so much better here + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array& indices) const { + return internal::tensor_index_linearization_helper::run( + indices, *static_cast(this)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array& indices) const { + return internal::tensor_index_linearization_helper::run( + indices, *static_cast(this)); + } +}; + +template +std::ostream& operator<<(std::ostream& os, const DSizes& dims) { + os << "["; + for (int i = 0; i < NumDims; ++i) { + if (i > 0) os << ", "; + os << dims[i]; + } + os << "]"; + return os; +} + +// Boilerplate +namespace internal { +template +struct tensor_vsize_index_linearization_helper { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index run(array const& indices, + std::vector const& dimensions) { + return array_get < RowMajor ? n + : (NumIndices - n - 1) > (indices) + array_get < RowMajor + ? n + : (NumIndices - n - 1) > + (dimensions)*tensor_vsize_index_linearization_helper::run( + indices, dimensions); + } +}; + +template +struct tensor_vsize_index_linearization_helper { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index run(array const& indices, + std::vector const&) { + return array_get < RowMajor ? 0 : NumIndices - 1 > (indices); + } +}; +} // end namespace internal + +namespace internal { + +template +struct array_size > { + static const ptrdiff_t value = NumDims; +}; +template +struct array_size > { + static const ptrdiff_t value = NumDims; +}; +template +struct array_size > { + static const std::ptrdiff_t value = Sizes::count; +}; +template +struct array_size > { + static const std::ptrdiff_t value = Sizes::count; +}; +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes&) { + return get >::value; +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) { + eigen_assert(false && "should never be called"); + return -1; +} + +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { return false; } +}; +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) { + return numext::equal_strict(array_get(dims1), array_get(dims2)) && + sizes_match_below_dim::run(dims1, dims2); + } +}; +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { return true; } +}; + +} // end namespace internal + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) { + return internal::sizes_match_below_dim::value, + internal::array_size::value>::run(dims1, dims2); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h new file mode 100644 index 00000000000..8f1e4c4ba0b --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -0,0 +1,196 @@ +// This file is part of Eigen, a 
lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template class MakePointer_> +struct traits > { + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename MakePointer_::Type PointerType; + + enum { Flags = 0 }; + template + struct MakePointer { + // Intermediate typedef to workaround MSVC issue. + typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; +}; + +template class MakePointer_> +struct eval, Eigen::Dense> { + typedef const TensorEvalToOp& type; +}; + +template class MakePointer_> +struct nested, 1, typename eval >::type> { + typedef TensorEvalToOp type; +}; + +} // end namespace internal + +template class MakePointer_> +class TensorEvalToOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename MakePointer_::Type PointerType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + static constexpr int NumDims = Eigen::internal::traits::NumDimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr) + : m_xpr(expr), m_buffer(buffer) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; } + + protected: + typename XprType::Nested m_xpr; + PointerType m_buffer; +}; + +template class MakePointer_> +struct TensorEvaluator, Device> { + typedef TensorEvalToOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef std::remove_const_t CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = true, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = true + }; + + static constexpr int Layout = TensorEvaluator::Layout; + static constexpr int NumDims = internal::traits::NumDimensions; + + //===- Tensor block evaluation strategy (see TensorBlock.h) 
-------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + typedef internal::TensorBlockAssignment + TensorBlockAssignment; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()) {} + + EIGEN_STRONG_INLINE ~TensorEvaluator() {} + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) { + EIGEN_UNUSED_VARIABLE(scalar); + eigen_assert(scalar == NULL); + return m_impl.evalSubExprsIfNeeded(m_buffer); + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType scalar, EvalSubExprsCallback done) { + EIGEN_UNUSED_VARIABLE(scalar); + eigen_assert(scalar == NULL); + m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const { m_buffer[i] = m_impl.coeff(i); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const { + internal::pstoret( + m_buffer + i, m_impl.template packet < TensorEvaluator::IsAligned ? Aligned : Unaligned > (i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return m_impl.getResourceRequirements(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlockDesc& desc, TensorBlockScratch& scratch) { + // Add `m_buffer` as destination buffer to the block descriptor. + desc.template AddDestinationBuffer( + /*dst_base=*/m_buffer + desc.offset(), + /*dst_strides=*/internal::strides(m_impl.dimensions())); + + ArgTensorBlock block = m_impl.block(desc, scratch, /*root_of_expr_ast=*/true); + + // If block was evaluated into a destination buffer, there is no need to do + // an assignment. + if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { + TensorBlockAssignment::Run( + TensorBlockAssignment::target(desc.dimensions(), internal::strides(m_impl.dimensions()), m_buffer, + desc.offset()), + block.expr()); + } + block.cleanup(); + } + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_buffer[index]; } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt(m_buffer + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here. 
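+  // Editor's note (illustrative, not upstream code): the TensorOpCost
+  // constructor used below is (bytes_loaded, bytes_stored, compute_cycles,
+  // vectorized, packet_size); only the bytes_stored slot is populated, so for
+  // a float output this charges sizeof(float) = 4 stored bytes per
+  // coefficient on top of the wrapped evaluator's own cost.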
+ return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; } + ArgType expression() const { return m_expression; } + + private: + TensorEvaluator m_impl; + EvaluatorPointerType m_buffer; + const ArgType m_expression; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h new file mode 100644 index 00000000000..d864fb46402 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -0,0 +1,859 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor evaluator classes. + * + * These classes are responsible for the evaluation of the tensor expression. + * + * TODO: add support for more types of expressions, in particular expressions + * leading to lvalues (slicing, reshaping, etc...) + */ + +// Generic evaluator +template +struct TensorEvaluator { + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + typedef Derived XprType; + static constexpr int PacketSize = PacketType::size; + typedef typename internal::traits::template MakePointer::Type TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + // NumDimensions is -1 for variable dim tensors + static constexpr int NumCoords = + internal::traits::NumDimensions > 0 ? 
internal::traits<Derived>::NumDimensions : 0;
+  static constexpr int Layout = Derived::Layout;
+
+  enum {
+    IsAligned = Derived::IsAligned,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = internal::is_arithmetic<std::remove_const_t<Scalar>>::value,
+    PreferBlockAccess = false,
+    CoordAccess = NumCoords > 0,
+    RawAccess = true
+  };
+
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords, Layout, Index> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+      : m_data(device.get((const_cast<TensorPointerType>(m.data())))), m_dims(m.dimensions()), m_device(device) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) {
+    if (!NumTraits<std::remove_const_t<Scalar>>::RequireInitialization && dest) {
+      m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
+      return false;
+    }
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    // TODO(ezhulenev): ThreadPoolDevice memcpy is a blocking operation.
+    done(evalSubExprsIfNeeded(dest));
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(m_data != NULL);
+    return m_data[index];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const {
+    eigen_assert(m_data != NULL);
+    return m_data[index];
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
+  }
+
+  // Return a packet starting at `index` where `umask` specifies which elements
+  // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for
+  // Packet16f, `umask` is of type uint16_t and if a bit is 1, the corresponding
+  // float element will be loaded, otherwise 0 will be loaded.
+  // Function has been templatized to enable Sfinae.
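+  // Editor's sketch (not upstream code): with PacketReturnTypeT = Packet16f
+  // the mask is a uint16_t, so umask = 0b0000000000000101 loads elements 0
+  // and 2 from m_data + index and fills the other fourteen lanes with 0.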
+ template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + std::enable_if_t::masked_load_available, PacketReturnTypeT> + partialPacket(Index index, typename internal::unpacket_traits::mask_t umask) const { + return internal::ploadu(m_data + index, umask); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + return internal::pstoret(m_data + index, x); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { + eigen_assert(m_data != NULL); + if (static_cast(Layout) == static_cast(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array& coords) const { + eigen_assert(m_data != NULL); + if (static_cast(Layout) == static_cast(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketType::size); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + eigen_assert(m_data != NULL); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) { + eigen_assert(m_data != NULL); + + typedef typename TensorBlock::XprType TensorBlockExpr; + typedef internal::TensorBlockAssignment TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target(desc.dimensions(), internal::strides(m_dims), m_data, desc.offset()), + block.expr()); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } + + protected: + EvaluatorPointerType m_data; + Dimensions m_dims; + const Device EIGEN_DEVICE_REF m_device; +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T loadConstant(const T* address) { + return *address; +} +// Use the texture cache on CUDA devices whenever possible +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) { + return __ldg(address); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double loadConstant(const double* address) { + return __ldg(address); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half loadConstant(const Eigen::half* address) { + return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); +} +#endif + +} // namespace internal + +// Default evaluator for rvalues +template +struct TensorEvaluator { + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + typedef const Derived XprType; + typedef typename internal::traits::template MakePointer::Type TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + typedef std::remove_const_t ScalarNoConst; + + // NumDimensions is 
+  // -1 for variable dim tensors
+  static constexpr int NumCoords =
+      internal::traits<Derived>::NumDimensions > 0 ? internal::traits<Derived>::NumDimensions : 0;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static constexpr int Layout = Derived::Layout;
+
+  enum {
+    IsAligned = Derived::IsAligned,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = internal::is_arithmetic<ScalarNoConst>::value,
+    PreferBlockAccess = false,
+    CoordAccess = NumCoords > 0,
+    RawAccess = true
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords, Layout, Index> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC TensorEvaluator(const Derived& m, const Device& device)
+      : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    if (!NumTraits<std::remove_const_t<Scalar>>::RequireInitialization && data) {
+      m_device.memcpy((void*)(m_device.get(data)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
+      return false;
+    }
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    // TODO(ezhulenev): ThreadPoolDevice memcpy is a blocking operation.
+    done(evalSubExprsIfNeeded(dest));
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(m_data != NULL);
+    return internal::loadConstant(m_data + index);
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index);
+  }
+
+  // Return a packet starting at `index` where `umask` specifies which elements
+  // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for
+  // Packet16f, `umask` is of type uint16_t and if a bit is 1, the corresponding
+  // float element will be loaded, otherwise 0 will be loaded.
+  // Function has been templatized to enable Sfinae.
+  template <typename PacketReturnTypeT>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      std::enable_if_t<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>
+      partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const {
+    return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumCoords>& coords) const {
+    eigen_assert(m_data != NULL);
+    const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ?
m_dims.IndexOfColMajor(coords) + : m_dims.IndexOfRowMajor(coords); + return internal::loadConstant(m_data + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketType::size); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + eigen_assert(m_data != NULL); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } + + protected: + EvaluatorPointerType m_data; + Dimensions m_dims; + const Device EIGEN_DEVICE_REF m_device; +}; + +// -------------------- CwiseNullaryOp -------------------- + +template +struct TensorEvaluator, Device> { + typedef TensorCwiseNullaryOp XprType; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = true, + PacketAccess = internal::functor_traits::PacketAccess +#ifdef EIGEN_USE_SYCL + && (PacketType::size > 1) +#endif + , + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + done(true); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() {} + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_wrapper(m_functor, index); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return m_wrapper.template packetOp(m_functor, index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketType::size); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + private: + const NullaryOp m_functor; + TensorEvaluator m_argImpl; + const internal::nullary_wrapper m_wrapper; +}; + +// -------------------- CwiseUnaryOp -------------------- + +template +struct TensorEvaluator, Device> { + typedef TensorCwiseUnaryOp XprType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = + 
int(TensorEvaluator::PacketAccess) & int(internal::functor_traits::PacketAccess), + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef std::remove_const_t ScalarNoConst; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + static constexpr int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + typedef internal::TensorCwiseUnaryBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_argImpl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_argImpl.cleanup(); } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_functor(m_argImpl.coeff(index)); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return m_functor.packetOp(m_argImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_argImpl.costPerCoeff(vectorized) + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const double functor_cost = internal::functor_traits::Cost; + return m_argImpl.getResourceRequirements().addCostPerCoeff({0, 0, functor_cost / PacketSize}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + return TensorBlock(m_argImpl.block(desc, scratch), m_functor); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + private: + const Device EIGEN_DEVICE_REF m_device; + const UnaryOp m_functor; + TensorEvaluator m_argImpl; +}; + +// -------------------- CwiseBinaryOp -------------------- + +template +struct TensorEvaluator, Device> { + typedef TensorCwiseBinaryOp XprType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = + int(TensorEvaluator::IsAligned) & int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess) & + 
int(internal::functor_traits::PacketAccess), + BlockAccess = int(TensorEvaluator::BlockAccess) & + int(TensorEvaluator::BlockAccess), + PreferBlockAccess = int(TensorEvaluator::PreferBlockAccess) | + int(TensorEvaluator::PreferBlockAccess), + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), + m_functor(op.functor()), + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout) || + internal::traits::NumDimensions <= 1), + YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int NumDims = internal::array_size::Dimensions>::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock LeftTensorBlock; + typedef typename TensorEvaluator::TensorBlock RightTensorBlock; + + typedef internal::TensorCwiseBinaryBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { + // TODO: use right impl instead if right impl dimensions are known at compile time. + return m_leftImpl.dimensions(); + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + // TODO(ezhulenev): Evaluate two expression in parallel? 
+ m_leftImpl.evalSubExprsIfNeededAsync( + nullptr, [this, done](bool) { m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { + return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return m_functor.packetOp(m_leftImpl.template packet(index), + m_rightImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_leftImpl.costPerCoeff(vectorized) + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const double functor_cost = internal::functor_traits::Cost; + return internal::TensorBlockResourceRequirements::merge(m_leftImpl.getResourceRequirements(), + m_rightImpl.getResourceRequirements()) + .addCostPerCoeff({0, 0, functor_cost / PacketSize}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + desc.DropDestinationBuffer(); + return TensorBlock(m_leftImpl.block(desc, scratch), m_rightImpl.block(desc, scratch), m_functor); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + private: + const Device EIGEN_DEVICE_REF m_device; + const BinaryOp m_functor; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + +// -------------------- CwiseTernaryOp -------------------- + +template +struct TensorEvaluator, Device> { + typedef TensorCwiseTernaryOp XprType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess && internal::functor_traits::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_arg1Impl(op.arg1Expression(), device), + m_arg2Impl(op.arg2Expression(), device), + m_arg3Impl(op.arg3Expression(), device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout) || + internal::traits::NumDimensions <= 1), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, + typename internal::traits::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, + typename internal::traits::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::Index, + typename internal::traits::Index>::value), + STORAGE_INDEX_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::Index, + typename internal::traits::Index>::value), + STORAGE_INDEX_MUST_MATCH) + + eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && + 
dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { + // TODO: use arg2 or arg3 dimensions if they are known at compile time. + return m_arg1Impl.dimensions(); + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_arg1Impl.evalSubExprsIfNeeded(NULL); + m_arg2Impl.evalSubExprsIfNeeded(NULL); + m_arg3Impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { + m_arg1Impl.cleanup(); + m_arg2Impl.cleanup(); + m_arg3Impl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { + return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return m_functor.packetOp(m_arg1Impl.template packet(index), m_arg2Impl.template packet(index), + m_arg3Impl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_arg1Impl.costPerCoeff(vectorized) + m_arg2Impl.costPerCoeff(vectorized) + + m_arg3Impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + private: + const TernaryOp m_functor; + TensorEvaluator m_arg1Impl; + TensorEvaluator m_arg2Impl; + TensorEvaluator m_arg3Impl; +}; + +// -------------------- SelectOp -------------------- + +template +struct TensorEvaluator, Device> { + typedef TensorSelectOp XprType; + typedef typename XprType::Scalar Scalar; + + using TernarySelectOp = internal::scalar_boolean_select_op::Scalar, + typename internal::traits::Scalar, + typename internal::traits::Scalar>; + static constexpr bool TernaryPacketAccess = + TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess && internal::functor_traits::PacketAccess; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = (TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess && PacketType::HasBlend) || + TernaryPacketAccess, + BlockAccess = TensorEvaluator::BlockAccess && + TensorEvaluator::BlockAccess && + TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_condImpl(op.ifExpression(), device), + m_thenImpl(op.thenExpression(), device), + m_elseImpl(op.elseExpression(), device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + 
static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); + eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock IfArgTensorBlock; + typedef typename TensorEvaluator::TensorBlock ThenArgTensorBlock; + typedef typename TensorEvaluator::TensorBlock ElseArgTensorBlock; + + struct TensorSelectOpBlockFactory { + template + struct XprType { + typedef TensorSelectOp type; + }; + + template + typename XprType::type expr(const IfArgXprType& if_expr, + const ThenArgXprType& then_expr, + const ElseArgXprType& else_expr) const { + return typename XprType::type(if_expr, then_expr, else_expr); + } + }; + + typedef internal::TensorTernaryExprBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { + // TODO: use then or else impl instead if they happen to be known at compile time. + return m_condImpl.dimensions(); + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_condImpl.evalSubExprsIfNeeded(NULL); + m_thenImpl.evalSubExprsIfNeeded(NULL); + m_elseImpl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { + m_thenImpl.evalSubExprsIfNeeded( + nullptr, [this, done](bool) { m_elseImpl.evalSubExprsIfNeeded(nullptr, [done](bool) { done(true); }); }); + }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_condImpl.cleanup(); + m_thenImpl.cleanup(); + m_elseImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { + return m_condImpl.coeff(index) ? 
m_thenImpl.coeff(index) : m_elseImpl.coeff(index); + } + + template = true> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + internal::Selector select; + EIGEN_UNROLL_LOOP + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index + i); + } + return internal::pblend(select, m_thenImpl.template packet(index), + m_elseImpl.template packet(index)); + } + + template = true> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return TernarySelectOp().template packetOp(m_thenImpl.template packet(index), + m_elseImpl.template packet(index), + m_condImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_condImpl.costPerCoeff(vectorized) + + m_thenImpl.costPerCoeff(vectorized).cwiseMax(m_elseImpl.costPerCoeff(vectorized)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + auto then_req = m_thenImpl.getResourceRequirements(); + auto else_req = m_elseImpl.getResourceRequirements(); + + auto merged_req = internal::TensorBlockResourceRequirements::merge(then_req, else_req); + merged_req.cost_per_coeff = then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff); + + return internal::TensorBlockResourceRequirements::merge(m_condImpl.getResourceRequirements(), merged_req); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + // It's unsafe to pass destination buffer to underlying expressions, because + // output might be aliased with one of the inputs. + desc.DropDestinationBuffer(); + + return TensorBlock(m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch), + m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler& cgh) const { + m_condImpl.bind(cgh); + m_thenImpl.bind(cgh); + m_elseImpl.bind(cgh); + } +#endif + private: + TensorEvaluator m_condImpl; + TensorEvaluator m_thenImpl; + TensorEvaluator m_elseImpl; +}; + +} // end namespace Eigen + +#if defined(EIGEN_USE_SYCL) && defined(SYCL_COMPILER_IS_DPCPP) +template +struct cl::sycl::is_device_copyable< + Eigen::TensorEvaluator, + std::enable_if_t>::value>> : std::true_type {}; +#endif + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h new file mode 100644 index 00000000000..fe4b38c4813 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -0,0 +1,672 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
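
An illustrative sketch, not part of the upstream patch: the evaluator specializations above (storage, nullary, unary, binary, ternary, and select) are never instantiated by hand; TensorEvaluator is chosen per expression node when an assignment runs. Assuming only the public Tensor API and the default single-threaded device:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(64, 64), b(64, 64);
  a.setRandom();
  b.setRandom();

  // `a + b * 0.5f` builds a cwise expression tree; assigning it constructs
  // the matching evaluators above and runs them coefficient- or packet-wise.
  Eigen::Tensor<float, 2> c = a + b * 0.5f;

  // Comparing against a scalar and calling select() routes through the
  // TensorSelectOp evaluator (condition/then/else sub-evaluators).
  Eigen::Tensor<float, 2> d = (a > 0.0f).select(b, c);

  return d.size() == 64 * 64 ? 0 : 1;
}
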
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/**
+ * \class TensorExecutor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor executor class.
+ *
+ * This class is responsible for launching the evaluation of the expression on
+ * the specified computing device.
+ *
+ * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
+ *                      instructions)
+ * @tparam Tiling       can use block based tensor evaluation
+ *                      (see TensorBlock.h)
+ */
+namespace internal {
+
+/**
+ * Evaluating TensorBroadcastingOp via the coefficient or packet path is
+ * extremely expensive. If an expression has at least one broadcast op and
+ * supports block-based evaluation, we always prefer it, even for small
+ * tensors. For all other tileable ops, the block evaluation overhead for
+ * small tensors (those that fit into L1) is too large, and we fall back on
+ * vectorized evaluation.
+ */
+
+// TODO(ezhulenev): Add specializations for all other types of Tensor ops.
+
+template
+struct ExpressionHasTensorBroadcastingOp {
+  enum { value = false };
+};
+
+template
+struct ExpressionHasTensorBroadcastingOp > {
+  enum { value = ExpressionHasTensorBroadcastingOp::value };
+};
+
+template
+struct ExpressionHasTensorBroadcastingOp > {
+  enum { value = ExpressionHasTensorBroadcastingOp::value };
+};
+
+template
+struct ExpressionHasTensorBroadcastingOp > {
+  enum {
+    value = ExpressionHasTensorBroadcastingOp::value || ExpressionHasTensorBroadcastingOp::value
+  };
+};
+
+template
+struct ExpressionHasTensorBroadcastingOp > {
+  enum { value = true };
+};
+
+// -------------------------------------------------------------------------- //
+
+/**
+ * Default strategy: the expression is evaluated sequentially with a single CPU
+ * thread, without vectorization or block evaluation.
+ */
+template
+class TensorExecutor {
+ public:
+  typedef typename Expression::Index StorageIndex;
+
+  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
+  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
+  // violation. If this template is instantiated with a non-default device, it
+  // means that this header file was included without defining
+  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
+  static_assert(std::is_same::value,
+                "Default executor instantiated with non-default device. "
+                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
+                "EIGEN_USE_SYCL before including Eigen headers.");
+
+  static EIGEN_STRONG_INLINE void run(const Expression& expr, const Device& device = DefaultDevice()) {
+    TensorEvaluator evaluator(expr, device);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
+      for (StorageIndex i = 0; i < size; ++i) {
+        evaluator.evalScalar(i);
+      }
+    }
+    evaluator.cleanup();
+  }
+};
+
+/**
+ * Default async execution strategy is not implemented. Currently it's only
+ * available for ThreadPoolDevice (see definition below).
+ */
+template
+class TensorAsyncExecutor {};
+
+/**
+ * Process all the data with a single CPU thread, using vectorized instructions.
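 * For example (an illustration added here, derived from the loop bounds
 * below, not upstream documentation): with PacketSize = 8 and size = 100,
 * UnrolledSize = (100 / 32) * 32 = 96, so indices [0, 96) are evaluated with
 * evalPacket() inside the 4x-unrolled loop, the second packet loop finds
 * nothing left to do, and the tail [96, 100) is finished off with
 * evalScalar().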
+ */ +template +class TensorExecutor { + public: + typedef typename Expression::Index StorageIndex; + + static EIGEN_STRONG_INLINE void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); + const int PacketSize = + unpacket_traits::PacketReturnType>::size; + + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not + // unroll the loop at the expense of inlining. + const StorageIndex UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; + for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } + } + const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize; + for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) { + evaluator.evalPacket(i); + } + for (StorageIndex i = VectorizedSize; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + +/** + * Process all the data with a single cpu thread, using blocks of data. By + * sizing a block to fit L1 cache we get better cache performance. + */ +template +class TensorExecutor { + public: + typedef typename traits::Scalar Scalar; + typedef std::remove_const_t ScalarNoConst; + + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; + + static constexpr int NumDims = traits::NumDimensions; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { + typedef TensorBlockMapper TensorBlockMapper; + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + Evaluator evaluator(expr, device); + + // TODO(ezhulenev): Do not use tiling for small tensors? + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + + if (needs_assign) { + // Query expression tree for desired block size/shape. + const TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements(); + + const TensorBlockMapper block_mapper(typename TensorBlockDesc::Dimensions(evaluator.dimensions()), requirements); + + // Share scratch memory allocator between all blocks. + TensorBlockScratch scratch(device); + + const StorageIndex total_block_count = block_mapper.blockCount(); + for (StorageIndex i = 0; i < total_block_count; ++i) { + TensorBlockDesc desc = block_mapper.blockDescriptor(i); + evaluator.evalBlock(desc, scratch); + scratch.reset(); + } + } + evaluator.cleanup(); + } +}; + +/** + * Multicore strategy: the index space is partitioned and each partition is + * executed on a single core. + * + * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread + * pool, and will block the caller thread until all tasks are finished. + * + * (2) TensorAsyncExecutor is a non-blocking version, that will submit work to + * the ThreadPoolDevice managed thread pool, and will return immediately. + * It will call 'done' callback after all tasks are finished. 
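 * A usage sketch (an illustration added here, not upstream documentation;
 * the pool size, names, and shapes are ours): the non-blocking path is
 * reached through the device() overload that takes a done-callback, e.g.
 *
 *   Eigen::ThreadPool pool(4);
 *   Eigen::ThreadPoolDevice dev(&pool, 4);
 *   Eigen::Tensor<float, 1> in(1024), out(1024);
 *   in.setRandom();
 *   Eigen::Barrier done_barrier(1);
 *   out.device(dev, [&done_barrier]() { done_barrier.Notify(); }) = in * 2.0f;
 *   done_barrier.Wait();  // runAsync() returned immediately; block here instead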
+ */ +#ifdef EIGEN_USE_THREADS + +template +struct TensorExecutorTilingContext { + TensorExecutorTilingContext() = default; + TensorExecutorTilingContext(const TensorBlockMapper& b_mapper, const TensorOpCost& b_cost, size_t b_aligned_size) + : block_mapper(b_mapper), cost(b_cost), aligned_blocksize(b_aligned_size) {} + + TensorBlockMapper block_mapper; // navigate through blocks + TensorOpCost cost; // cost of computing a single block + size_t aligned_blocksize; // block size after memory alignment +}; + +// Computes a block evaluation parameters, and allocates temporary memory buffer +// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below. +template +TensorExecutorTilingContext GetTensorExecutorTilingContext(const Evaluator& evaluator) { + // Query expression tree for desired block size/shape. + TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements(); + + // Update target block size based on cost model. + double taskSize = TensorCostModel::taskSize(1, requirements.cost_per_coeff); + requirements.size = static_cast(1.0 / taskSize); + + TensorBlockMapper block_mapper(typename TensorBlockMapper::Dimensions(evaluator.dimensions()), requirements); + + size_t block_size = block_mapper.blockTotalSize(); + const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + const size_t aligned_blocksize = + align * numext::div_ceil(block_size * sizeof(typename Evaluator::Scalar), align); + + return {block_mapper, requirements.cost_per_coeff * block_size, aligned_blocksize}; +} + +template +struct EvalRange { + static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, const StorageIndex lastIdx) { + Evaluator evaluator = *evaluator_in; + eigen_assert(lastIdx >= firstIdx); + for (StorageIndex i = firstIdx; i < lastIdx; ++i) { + evaluator.evalScalar(i); + } + } + + static StorageIndex alignBlockSize(StorageIndex size) { return size; } +}; + +template +struct EvalRange { + static constexpr int PacketSize = unpacket_traits::size; + + static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, const StorageIndex lastIdx) { + Evaluator evaluator = *evaluator_in; + eigen_assert(lastIdx >= firstIdx); + StorageIndex i = firstIdx; + if (lastIdx - firstIdx >= PacketSize) { + eigen_assert(firstIdx % PacketSize == 0); + StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize; + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not + // unroll the loop at the expense of inlining. + for (; i <= last_chunk_offset; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } + } + last_chunk_offset = lastIdx - PacketSize; + for (; i <= last_chunk_offset; i += PacketSize) { + evaluator.evalPacket(i); + } + } + for (; i < lastIdx; ++i) { + evaluator.evalScalar(i); + } + } + + static StorageIndex alignBlockSize(StorageIndex size) { + // Align block size to packet size and account for unrolling in run above. + if (size >= 16 * PacketSize) { + return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); + } + // Aligning to 4 * PacketSize would increase block size by more than 25%. 
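    // (Worked example added here, not upstream: with PacketSize = 8 the cutoff
    // is 16 * 8 = 128; size = 200 takes the branch above and rounds up to a
    // multiple of 32, giving 224, while size = 100 falls through to the line
    // below and rounds up to a multiple of 8, giving 104.)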
+ return (size + PacketSize - 1) & ~(PacketSize - 1); + } +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index StorageIndex; + + static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { + typedef TensorEvaluator Evaluator; + typedef EvalRange EvalRange; + + Evaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); + device.parallelFor( + size, evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize, + [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) { EvalRange::run(&evaluator, firstIdx, lastIdx); }); + } + evaluator.cleanup(); + } +}; + +template +class TensorExecutor { + public: + typedef typename traits::Index IndexType; + typedef typename traits::Scalar Scalar; + typedef std::remove_const_t ScalarNoConst; + + static constexpr int NumDims = traits::NumDimensions; + + typedef TensorEvaluator Evaluator; + typedef TensorBlockMapper BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { + Evaluator evaluator(expr, device); + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const TilingContext tiling = + internal::GetTensorExecutorTilingContext(evaluator); + + auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx, IndexType lastBlockIdx) { + TensorBlockScratch scratch(device); + + for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) { + TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx); + evaluator.evalBlock(desc, scratch); + scratch.reset(); + } + }; + + // Evaluate small expressions directly as a single block. 
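      // (Note added here, not upstream: evaluating in place skips
      // device.parallelFor() entirely, avoiding task-scheduling overhead when
      // the whole expression fits in one block; in the multi-block case below,
      // tiling.cost lets the cost model decide how work is partitioned across
      // worker threads.)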
+ if (tiling.block_mapper.blockCount() == 1) { + TensorBlockScratch scratch(device); + TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions()); + evaluator.evalBlock(desc, scratch); + } else { + device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, eval_block); + } + } + evaluator.cleanup(); + } +}; + +template +class TensorAsyncExecutor { + public: + typedef typename Expression::Index StorageIndex; + typedef TensorEvaluator Evaluator; + + static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, const ThreadPoolDevice& device, DoneCallback done) { + TensorAsyncExecutorContext* const ctx = new TensorAsyncExecutorContext(expr, device, std::move(done)); + + const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void { + if (!need_assign) { + delete ctx; + return; + } + + typedef EvalRange EvalRange; + const StorageIndex size = array_prod(ctx->evaluator.dimensions()); + device.parallelForAsync( + size, ctx->evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize, + [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { EvalRange::run(&ctx->evaluator, firstIdx, lastIdx); }, + [ctx]() { delete ctx; }); + }; + + ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); + } + + private: + struct TensorAsyncExecutorContext { + TensorAsyncExecutorContext(const Expression& expr, const ThreadPoolDevice& thread_pool, DoneCallback done) + : evaluator(expr, thread_pool), on_done(std::move(done)) {} + + ~TensorAsyncExecutorContext() { + evaluator.cleanup(); + on_done(); + } + + Evaluator evaluator; + + private: + DoneCallback on_done; + }; +}; + +template +class TensorAsyncExecutor { + public: + typedef typename traits::Index IndexType; + typedef typename traits::Scalar Scalar; + typedef std::remove_const_t ScalarNoConst; + + static constexpr int NumDims = traits::NumDimensions; + + typedef TensorEvaluator Evaluator; + typedef TensorBlockMapper BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, const ThreadPoolDevice& device, DoneCallback done) { + TensorAsyncExecutorContext* const ctx = new TensorAsyncExecutorContext(expr, device, std::move(done)); + + const auto on_eval_subexprs = [ctx](bool need_assign) -> void { + if (!need_assign) { + delete ctx; + return; + } + + ctx->tiling = internal::GetTensorExecutorTilingContext(ctx->evaluator); + + auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) { + TensorBlockScratch scratch(ctx->device); + + for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) { + TensorBlockDesc desc = ctx->tiling.block_mapper.blockDescriptor(block_idx); + ctx->evaluator.evalBlock(desc, scratch); + scratch.reset(); + } + }; + + // Evaluate small expressions directly as a single block. 
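      // (Note added here, not upstream: unlike the synchronous executor above,
      // every exit path in the async variant must delete ctx;
      // ~TensorAsyncExecutorContext() runs evaluator.cleanup() and then fires
      // the user's done callback.)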
+ if (ctx->tiling.block_mapper.blockCount() == 1) { + TensorBlockScratch scratch(ctx->device); + TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions()); + ctx->evaluator.evalBlock(desc, scratch); + delete ctx; + } else { + ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(), ctx->tiling.cost, eval_block, + [ctx]() { delete ctx; }); + } + }; + + ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); + } + + private: + struct TensorAsyncExecutorContext { + TensorAsyncExecutorContext(const Expression& expr, const ThreadPoolDevice& thread_pool, DoneCallback done) + : device(thread_pool), evaluator(expr, thread_pool), on_done(std::move(done)) {} + + ~TensorAsyncExecutorContext() { + evaluator.cleanup(); + on_done(); + } + + const ThreadPoolDevice& device; + Evaluator evaluator; + TilingContext tiling; + + private: + DoneCallback on_done; + }; +}; + +#endif // EIGEN_USE_THREADS + +// GPU: the evaluation of the expression is offloaded to a GPU. +#if defined(EIGEN_USE_GPU) + +template +class TensorExecutor { + public: + typedef typename Expression::Index StorageIndex; + static void run(const Expression& expr, const GpuDevice& device); +}; + +#if defined(EIGEN_GPUCC) +// Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0. +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int sum_will_overflow(Index lhs, Index rhs) { + const Index highest = NumTraits::highest(); + const Index lowest = NumTraits::lowest(); + if (lhs > 0 && rhs > 0) { + return lhs > highest - rhs ? 1 : 0; + } else if (lhs < 0 && rhs < 0) { + return lhs < lowest - rhs ? -1 : 0; + } else { + return 0; + } +} + +// Returns lhs + rhs, saturating to the highest/lowest representable value on +// overflow/underflow respectively. +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) { + const Index highest = NumTraits::highest(); + const Index lowest = NumTraits::lowest(); + int overflow = sum_will_overflow(lhs, rhs); + return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs; +} + +// A functor that adds step_size to a given index, saturating to avoid +// overflow/underflow. If overflow/underflow is not possible, regular addition +// is used (for efficiency). +template +struct SafeStep { + // lastIdx is one past the end of the possible indexes. + // step_size is the value that will be added to the given index when the + // functor is called. + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SafeStep(Index lastIdx, Index step_size) + : can_overflow_(sum_will_overflow(lastIdx, step_size)), step_size_(step_size) {} + + // Adds step_size to index, saturating on overflow (if overflow is possible). + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index operator()(Index index) const { + return can_overflow_ ? 
saturate_add(index, step_size_) : index + step_size_;
+  }
+
+ private:
+  const bool can_overflow_;
+  const Index step_size_;
+};
+
+template
+struct EigenMetaKernelEval {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
+                                                        StorageIndex step_size) {
+    SafeStep safe_step(lastIdx, step_size);
+    for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) {
+      eval.evalScalar(i);
+    }
+  }
+};
+
+template
+struct EigenMetaKernelEval {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
+                                                        StorageIndex step_size) {
+    const StorageIndex PacketSize = unpacket_traits::size;
+    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
+    const StorageIndex vectorized_step_size = step_size * PacketSize;
+
+    SafeStep safe_vectorized_step(vectorized_size, vectorized_step_size);
+    // Use the vector path
+    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size; i = safe_vectorized_step(i)) {
+      eval.evalPacket(i);
+    }
+    SafeStep safe_step(lastIdx, step_size);
+    for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx; i = safe_step(i)) {
+      eval.evalScalar(i);
+    }
+  }
+};
+
+template
+__global__ void __launch_bounds__(1024) EigenMetaKernel(Evaluator eval, StorageIndex size) {
+  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const StorageIndex step_size = blockDim.x * gridDim.x;
+
+  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
+  EigenMetaKernelEval::run(eval, first_index, size, step_size);
+}
+
+/*static*/
+template
+EIGEN_STRONG_INLINE void TensorExecutor::run(const Expression& expr,
+                                             const GpuDevice& device) {
+  TensorEvaluator evaluator(expr, device);
+  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+  if (needs_assign) {
+    const int block_size = device.maxGpuThreadsPerBlock();
+    const int max_blocks = static_cast(
+        numext::mini(device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor(),
+                     NumTraits::highest()) /
+        block_size);
+    const StorageIndex size = array_prod(evaluator.dimensions());
+    // Create at least one block to ensure we won't crash when TensorFlow calls with tensors of size 0.
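    // (Worked example added here, not upstream: with size = 0,
    // div_ceil(0, block_size) = 0, so the outer maxi(..., 1) below still
    // launches one block whose threads immediately fall out of the
    // evaluation loop.)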
+ const int num_blocks = numext::maxi( + numext::mini(max_blocks, static_cast(numext::div_ceil(size, block_size))), 1); + + LAUNCH_GPU_KERNEL((EigenMetaKernel, StorageIndex>), num_blocks, block_size, + 0, device, evaluator, size); + } + evaluator.cleanup(); +} + +#endif // EIGEN_GPUCC +#endif // EIGEN_USE_GPU + +// SYCL Executor policy +#ifdef EIGEN_USE_SYCL + +template +struct ExecExprFunctorKernel { + typedef typename Evaluator::Index Index; + Evaluator evaluator; + const Index range; + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(const Scratch, Evaluator evaluator_, const Index range_) + : evaluator(evaluator_), range(range_) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { compute(itemID); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t compute(const cl::sycl::nd_item<1>& itemID) const { + Index gId = static_cast(itemID.get_global_linear_id()); + Index total_threads = itemID.get_global_range(0); + + for (Index i = gId; i < range; i += total_threads) { + evaluator.evalScalar(i); + } + } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t compute(const cl::sycl::nd_item<1>& itemID) const { + const Index vectorizedRange = (range / Evaluator::PacketSize) * Evaluator::PacketSize; + Index gId = static_cast(itemID.get_global_linear_id()); + const Index step = Evaluator::PacketSize * itemID.get_global_range(0); + const Index start = Evaluator::PacketSize * gId; + for (Index i = start; i < vectorizedRange; i += step) { + evaluator.evalPacket(i); + } + gId += vectorizedRange; + for (Index i = gId; i < range; i += itemID.get_global_range(0)) { + evaluator.evalScalar(i); + } + } +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + static EIGEN_STRONG_INLINE void run(const Expression& expr, const Eigen::SyclDevice& dev) { + typedef Eigen::TensorEvaluator Evaluator; + Evaluator evaluator(expr, dev); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + Index range, GRange, tileSize; + Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); + total_size = (total_size == 0) ? 1 : total_size; + const int PacketSize = Eigen::PacketType::size; + Index vectorizable_threads = static_cast(total_size / PacketSize); + dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange); + range = total_size; + + dev.template nullary_kernel_launcher >( + evaluator, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1), + range) + .wait(); + } + evaluator.cleanup(); + } +}; + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h new file mode 100644 index 00000000000..5f4c72367c6 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -0,0 +1,329 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
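
A compilable sketch, not part of the upstream patch, of how the executors above get selected; it assumes EIGEN_USE_THREADS is defined before the include, and the pool size and tensor shapes are illustrative:

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice dev(&pool, /*num_cores=*/4);

  Eigen::Tensor<float, 2> a(512, 512), b(512, 512), c(512, 512);
  a.setRandom();
  b.setRandom();

  // A plain assignment would run on DefaultDevice; routing through
  // .device(dev) instead instantiates the ThreadPoolDevice executor, which
  // picks vectorized and/or tiled evaluation from the evaluator's traits.
  c.device(dev) = a * b + a.constant(1.0f);
  return 0;
}
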
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorExpr + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor expression classes. + * + * The TensorCwiseNullaryOp class applies a nullary operators to an expression. + * This is typically used to generate constants. + * + * The TensorCwiseUnaryOp class represents an expression where a unary operator + * (e.g. cwiseSqrt) is applied to an expression. + * + * The TensorCwiseBinaryOp class represents an expression where a binary + * operator (e.g. addition) is applied to a lhs and a rhs expression. + * + */ +namespace internal { +template +struct traits > : traits { + typedef traits XprTraits; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Nested XprTypeNested; + typedef std::remove_reference_t XprTypeNested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; + enum { Flags = 0 }; +}; + +} // end namespace internal + +template +class TensorCwiseNullaryOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef TensorCwiseNullaryOp Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& nestedExpression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC const NullaryOp& functor() const { return m_functor; } + + protected: + typename XprType::Nested m_xpr; + const NullaryOp m_functor; +}; + +namespace internal { +template +struct traits > : traits { + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename result_of::type Scalar; + typedef traits XprTraits; + typedef typename XprType::Nested XprTypeNested; + typedef std::remove_reference_t XprTypeNested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename TypeConversion::type PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorCwiseUnaryOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorCwiseUnaryOp type; +}; + +} // end namespace internal + +template +class TensorCwiseUnaryOp : public TensorBase, ReadOnlyAccessors> { + public: + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. 
+ typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC const UnaryOp& functor() const { return m_functor; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC const internal::remove_all_t& nestedExpression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const UnaryOp m_functor; +}; + +namespace internal { +template +struct traits > { + // Type promotion to handle the case where the types of the lhs and the rhs + // are different. + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. + typedef typename result_of::type Scalar; + typedef traits XprTraits; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef + typename promote_index_type::Index, typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename TypeConversion::val, + typename traits::PointerType, + typename traits::PointerType> >::type PointerType; + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorCwiseBinaryOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorCwiseBinaryOp type; +}; + +} // end namespace internal + +template +class TensorCwiseBinaryOp + : public TensorBase, ReadOnlyAccessors> { + public: + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, + const BinaryOp& func = BinaryOp()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} + + EIGEN_DEVICE_FUNC const BinaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC const internal::remove_all_t& lhsExpression() const { + return m_lhs_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& rhsExpression() const { + return m_rhs_xpr; + } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const BinaryOp m_functor; +}; + +namespace internal { +template +struct traits > { + // Type promotion to handle the case where the types of the args are different. 
+ typedef typename result_of::type Scalar; + typedef traits XprTraits; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename Arg1XprType::Nested Arg1Nested; + typedef typename Arg2XprType::Nested Arg2Nested; + typedef typename Arg3XprType::Nested Arg3Nested; + typedef std::remove_reference_t Arg1Nested_; + typedef std::remove_reference_t Arg2Nested_; + typedef std::remove_reference_t Arg3Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename TypeConversion::val, + typename traits::PointerType, + typename traits::PointerType> >::type PointerType; + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorCwiseTernaryOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorCwiseTernaryOp type; +}; + +} // end namespace internal + +template +class TensorCwiseTernaryOp + : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, + const Arg3XprType& arg3, + const TernaryOp& func = TernaryOp()) + : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {} + + EIGEN_DEVICE_FUNC const TernaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC const internal::remove_all_t& arg1Expression() const { + return m_arg1_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& arg2Expression() const { + return m_arg2_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& arg3Expression() const { + return m_arg3_xpr; + } + + protected: + typename Arg1XprType::Nested m_arg1_xpr; + typename Arg2XprType::Nested m_arg2_xpr; + typename Arg3XprType::Nested m_arg3_xpr; + const TernaryOp m_functor; +}; + +namespace internal { +template +struct traits > : traits { + typedef typename traits::Scalar Scalar; + typedef traits XprTraits; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef + typename promote_index_type::Index, typename traits::Index>::type Index; + typedef typename IfXprType::Nested IfNested; + typedef typename ThenXprType::Nested ThenNested; + typedef typename ElseXprType::Nested ElseNested; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef std::conditional_t::val, + typename traits::PointerType, typename traits::PointerType> + PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorSelectOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorSelectOp type; +}; + +} // end namespace internal + +template +class TensorSelectOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename 
Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC TensorSelectOp(const IfXprType& a_condition, const ThenXprType& a_then, const ElseXprType& a_else) + : m_condition(a_condition), m_then(a_then), m_else(a_else) {} + + EIGEN_DEVICE_FUNC const IfXprType& ifExpression() const { return m_condition; } + + EIGEN_DEVICE_FUNC const ThenXprType& thenExpression() const { return m_then; } + + EIGEN_DEVICE_FUNC const ElseXprType& elseExpression() const { return m_else; } + + protected: + typename IfXprType::Nested m_condition; + typename ThenXprType::Nested m_then; + typename ElseXprType::Nested m_else; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h new file mode 100644 index 00000000000..f5d1b2da357 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -0,0 +1,674 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Jianwei Cui +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H +#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorFFT + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor FFT class. + * + * TODO: + * Vectorize the Cooley Tukey and the Bluestein algorithm + * Add support for multithreaded evaluation + * Improve the performance on GPU + */ + +template +struct MakeComplex { + template + EIGEN_DEVICE_FUNC T operator()(const T& val) const { + return val; + } +}; + +template <> +struct MakeComplex { + template + EIGEN_DEVICE_FUNC std::complex operator()(const T& val) const { + return std::complex(val, 0); + } +}; + +template <> +struct MakeComplex { + template + EIGEN_DEVICE_FUNC std::complex operator()(const std::complex& val) const { + return val; + } +}; + +template +struct PartOf { + template + T operator()(const T& val) const { + return val; + } +}; + +template <> +struct PartOf { + template + T operator()(const std::complex& val) const { + return val.real(); + } +}; + +template <> +struct PartOf { + template + T operator()(const std::complex& val) const { + return val.imag(); + } +}; + +namespace internal { +template +struct traits > : public traits { + typedef traits XprTraits; + typedef typename NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename XprTraits::Scalar InputScalar; + typedef std::conditional_t + OutputScalar; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename traits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorFFTOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorFFTOp type; +}; + +} // end namespace internal + +template +class TensorFFTOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename 
Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef std::conditional_t + OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft) : m_xpr(expr), m_fft(fft) {} + + EIGEN_DEVICE_FUNC const FFT& fft() const { return m_fft; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const FFT m_fft; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorFFTOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename TensorEvaluator::Dimensions InputDimensions; + typedef internal::traits XprTraits; + typedef typename XprTraits::Scalar InputScalar; + typedef std::conditional_t + OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = internal::unpacket_traits::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = true, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + } + } + m_size = m_dimensions.TotalSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (data) { + evalToBuf(data); + return false; + } else { + m_data = (EvaluatorPointerType)m_device.get( + (CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size))); + evalToBuf(m_data); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + if (m_data) { + m_device.deallocate(m_data); + m_data = NULL; + } + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { return m_data[index]; } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + return 
internal::ploadt(m_data + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) { + const bool write_to_out = internal::is_same::value; + ComplexScalar* buf = + write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); + + for (Index i = 0; i < m_size; ++i) { + buf[i] = MakeComplex::value>()(m_impl.coeff(i)); + } + + for (size_t i = 0; i < m_fft.size(); ++i) { + Index dim = m_fft[i]; + eigen_assert(dim >= 0 && dim < NumDims); + Index line_len = m_dimensions[dim]; + eigen_assert(line_len >= 1); + ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len); + const bool is_power_of_two = isPowerOfTwo(line_len); + const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len); + const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite); + + ComplexScalar* a = + is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* b = + is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* pos_j_base_powered = + is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1)); + if (!is_power_of_two) { + // Compute twiddle factors + // t_n = exp(sqrt(-1) * pi * n^2 / line_len) + // for n = 0, 1,..., line_len-1. + // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 + + // The recurrence is correct in exact arithmetic, but causes + // numerical issues for large transforms, especially in + // single-precision floating point. + // + // pos_j_base_powered[0] = ComplexScalar(1, 0); + // if (line_len > 1) { + // const ComplexScalar pos_j_base = ComplexScalar( + // numext::cos(M_PI / line_len), numext::sin(M_PI / line_len)); + // pos_j_base_powered[1] = pos_j_base; + // if (line_len > 2) { + // const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; + // for (int i = 2; i < line_len + 1; ++i) { + // pos_j_base_powered[i] = pos_j_base_powered[i - 1] * + // pos_j_base_powered[i - 1] / + // pos_j_base_powered[i - 2] * + // pos_j_base_sq; + // } + // } + // } + // TODO(rmlarsen): Find a way to use Eigen's vectorized sin + // and cosine functions here. 
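For reference, the direct evaluation in the loop below can be written as a standalone helper. This is a minimal sketch under assumed names: it computes the Bluestein twiddle factors t_n = exp(i * pi * n^2 / len) element by element, which costs a sin/cos per entry but, unlike the recurrence, accumulates no rounding error.

#include <cmath>
#include <complex>
#include <cstddef>
#include <vector>

// Direct twiddle-factor evaluation for Bluestein's algorithm. Each entry is
// computed from scratch, so errors do not compound across iterations.
std::vector<std::complex<double>> bluestein_twiddles(int len) {
  const double kPi = 3.14159265358979323846;
  std::vector<std::complex<double>> t(static_cast<std::size_t>(len) + 1);
  for (int n = 0; n <= len; ++n) {
    const double arg = (kPi * n) * n / len;
    t[n] = std::complex<double>(std::cos(arg), std::sin(arg));
  }
  return t;
}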
+ for (int j = 0; j < line_len + 1; ++j) { + double arg = ((EIGEN_PI * j) * j) / line_len; + std::complex tmp(numext::cos(arg), numext::sin(arg)); + pos_j_base_powered[j] = static_cast(tmp); + } + } + + for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) { + const Index base_offset = getBaseOffsetFromIndex(partial_index, dim); + + // get data into line_buf + const Index stride = m_strides[dim]; + if (stride == 1) { + m_device.memcpy(line_buf, &buf[base_offset], line_len * sizeof(ComplexScalar)); + } else { + Index offset = base_offset; + for (int j = 0; j < line_len; ++j, offset += stride) { + line_buf[j] = buf[offset]; + } + } + + // process the line + if (is_power_of_two) { + processDataLineCooleyTukey(line_buf, line_len, log_len); + } else { + processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered); + } + + // write back + if (FFTDir == FFT_FORWARD && stride == 1) { + m_device.memcpy(&buf[base_offset], line_buf, line_len * sizeof(ComplexScalar)); + } else { + Index offset = base_offset; + const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); + for (int j = 0; j < line_len; ++j, offset += stride) { + buf[offset] = (FFTDir == FFT_FORWARD) ? line_buf[j] : line_buf[j] * div_factor; + } + } + } + m_device.deallocate(line_buf); + if (!is_power_of_two) { + m_device.deallocate(a); + m_device.deallocate(b); + m_device.deallocate(pos_j_base_powered); + } + } + + if (!write_to_out) { + for (Index i = 0; i < m_size; ++i) { + data[i] = PartOf()(buf[i]); + } + m_device.deallocate(buf); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) { + eigen_assert(x > 0); + return !(x & (x - 1)); + } + + // The composite number for padding, used in Bluestein's FFT algorithm + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) { + Index i = 2; + while (i < 2 * n - 1) i *= 2; + return i; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) { + Index log2m = 0; + while (m >>= 1) log2m++; + return log2m; + } + + // Call Cooley Tukey algorithm directly, data length must be power of 2 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len, + Index log_len) { + eigen_assert(isPowerOfTwo(line_len)); + scramble_FFT(line_buf, line_len); + compute_1D_Butterfly(line_buf, line_len, log_len); + } + + // Call Bluestein's FFT algorithm, m is a good composite number greater than (2 * n - 1), used as the padding length + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len, + Index good_composite, Index log_len, + ComplexScalar* a, ComplexScalar* b, + const ComplexScalar* pos_j_base_powered) { + Index n = line_len; + Index m = good_composite; + ComplexScalar* data = line_buf; + + for (Index i = 0; i < n; ++i) { + if (FFTDir == FFT_FORWARD) { + a[i] = data[i] * numext::conj(pos_j_base_powered[i]); + } else { + a[i] = data[i] * pos_j_base_powered[i]; + } + } + for (Index i = n; i < m; ++i) { + a[i] = ComplexScalar(0, 0); + } + + for (Index i = 0; i < n; ++i) { + if (FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[i]; + } else { + b[i] = numext::conj(pos_j_base_powered[i]); + } + } + for (Index i = n; i < m - n; ++i) { + b[i] = ComplexScalar(0, 0); + } + for (Index i = m - n; i < m; ++i) { + if (FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[m - i]; + } else { + b[i] = numext::conj(pos_j_base_powered[m - i]); + } + } + + scramble_FFT(a, m); + 
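The sequence of calls that follows is the convolution-theorem step at the heart of Bluestein's method: transform both zero-padded sequences, multiply pointwise, transform back, then undo the inverse transform's 1/m scaling. A compact sketch, where fft and ifft are assumed stand-ins for the scramble_FFT plus compute_1D_Butterfly pair used here:

#include <complex>
#include <cstddef>
#include <vector>

void fft(std::vector<std::complex<double>>& v);   // assumed in-place FFT
void ifft(std::vector<std::complex<double>>& v);  // assumed unscaled inverse

// Circular convolution of a and b (both already padded to length m),
// written into a.
void circular_convolve(std::vector<std::complex<double>>& a,
                       std::vector<std::complex<double>>& b) {
  fft(a);
  fft(b);
  for (std::size_t i = 0; i < a.size(); ++i) a[i] *= b[i];  // pointwise
  ifft(a);
  for (auto& v : a) v /= static_cast<double>(a.size());     // 1/m scaling
}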
compute_1D_Butterfly(a, m, log_len); + + scramble_FFT(b, m); + compute_1D_Butterfly(b, m, log_len); + + for (Index i = 0; i < m; ++i) { + a[i] *= b[i]; + } + + scramble_FFT(a, m); + compute_1D_Butterfly(a, m, log_len); + + // Do the scaling after ifft + for (Index i = 0; i < m; ++i) { + a[i] /= m; + } + + for (Index i = 0; i < n; ++i) { + if (FFTDir == FFT_FORWARD) { + data[i] = a[i] * numext::conj(pos_j_base_powered[i]); + } else { + data[i] = a[i] * pos_j_base_powered[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) { + eigen_assert(isPowerOfTwo(n)); + Index j = 1; + for (Index i = 1; i < n; ++i) { + if (j > i) { + std::swap(data[j - 1], data[i - 1]); + } + Index m = n >> 1; + while (m >= 2 && j > m) { + j -= m; + m >>= 1; + } + j += m; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_2(ComplexScalar* data) { + ComplexScalar tmp = data[1]; + data[1] = data[0] - data[1]; + data[0] += tmp; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_4(ComplexScalar* data) { + ComplexScalar tmp[4]; + tmp[0] = data[0] + data[1]; + tmp[1] = data[0] - data[1]; + tmp[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]); + } else { + tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]); + } + data[0] = tmp[0] + tmp[2]; + data[1] = tmp[1] + tmp[3]; + data[2] = tmp[0] - tmp[2]; + data[3] = tmp[1] - tmp[3]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_8(ComplexScalar* data) { + ComplexScalar tmp_1[8]; + ComplexScalar tmp_2[8]; + + tmp_1[0] = data[0] + data[1]; + tmp_1[1] = data[0] - data[1]; + tmp_1[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1); + } else { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1); + } + tmp_1[4] = data[4] + data[5]; + tmp_1[5] = data[4] - data[5]; + tmp_1[6] = data[6] + data[7]; + if (Dir == FFT_FORWARD) { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1); + } else { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1); + } + tmp_2[0] = tmp_1[0] + tmp_1[2]; + tmp_2[1] = tmp_1[1] + tmp_1[3]; + tmp_2[2] = tmp_1[0] - tmp_1[2]; + tmp_2[3] = tmp_1[1] - tmp_1[3]; + tmp_2[4] = tmp_1[4] + tmp_1[6]; +// SQRT2DIV2 = sqrt(2)/2 +#define SQRT2DIV2 0.7071067811865476 + if (Dir == FFT_FORWARD) { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2); + } else { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2); + } + data[0] = tmp_2[0] + tmp_2[4]; + data[1] = tmp_2[1] + tmp_2[5]; + data[2] = tmp_2[2] + tmp_2[6]; + data[3] = tmp_2[3] + tmp_2[7]; + data[4] = tmp_2[0] - tmp_2[4]; + data[5] = tmp_2[1] - tmp_2[5]; + data[6] = tmp_2[2] - tmp_2[6]; + data[7] = tmp_2[3] - tmp_2[7]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge(ComplexScalar* data, Index n, Index n_power_of_2) { + // Original code: + // RealScalar wtemp = std::sin(M_PI/n); + // RealScalar wpi = -std::sin(2 * M_PI/n); + const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2]; + const RealScalar wpi = + (Dir == FFT_FORWARD) ? 
m_minus_sin_2_PI_div_n_LUT[n_power_of_2] : -m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; + + const ComplexScalar wp(wtemp, wpi); + const ComplexScalar wp_one = wp + ComplexScalar(1, 0); + const ComplexScalar wp_one_2 = wp_one * wp_one; + const ComplexScalar wp_one_3 = wp_one_2 * wp_one; + const ComplexScalar wp_one_4 = wp_one_3 * wp_one; + const Index n2 = n / 2; + ComplexScalar w(1.0, 0.0); + for (Index i = 0; i < n2; i += 4) { + ComplexScalar temp0(data[i + n2] * w); + ComplexScalar temp1(data[i + 1 + n2] * w * wp_one); + ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2); + ComplexScalar temp3(data[i + 3 + n2] * w * wp_one_3); + w = w * wp_one_4; + + data[i + n2] = data[i] - temp0; + data[i] += temp0; + + data[i + 1 + n2] = data[i + 1] - temp1; + data[i + 1] += temp1; + + data[i + 2 + n2] = data[i + 2] - temp2; + data[i + 2] += temp2; + + data[i + 3 + n2] = data[i + 3] - temp3; + data[i + 3] += temp3; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly(ComplexScalar* data, Index n, Index n_power_of_2) { + eigen_assert(isPowerOfTwo(n)); + if (n > 8) { + compute_1D_Butterfly(data, n / 2, n_power_of_2 - 1); + compute_1D_Butterfly(data + n / 2, n / 2, n_power_of_2 - 1); + butterfly_1D_merge(data, n, n_power_of_2); + } else if (n == 8) { + butterfly_8(data); + } else if (n == 4) { + butterfly_4(data); + } else if (n == 2) { + butterfly_2(data); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const { + Index result = 0; + + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > omitted_dim; --i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } else { + for (Index i = 0; i < omitted_dim; ++i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } + // Value of index_coords[omitted_dim] is not determined to this step + return result; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, Index omitted_dim, Index offset) const { + Index result = base + offset * m_strides[omitted_dim]; + return result; + } + + protected: + Index m_size; + const FFT EIGEN_DEVICE_REF m_fft; + Dimensions m_dimensions; + array m_strides; + TensorEvaluator m_impl; + EvaluatorPointerType m_data; + const Device EIGEN_DEVICE_REF m_device; + + // This will support a maximum FFT size of 2^32 for each dimension + // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; + const RealScalar m_sin_PI_div_n_LUT[32] = {RealScalar(0.0), + RealScalar(-2), + RealScalar(-0.999999999999999), + RealScalar(-0.292893218813453), + RealScalar(-0.0761204674887130), + RealScalar(-0.0192147195967696), + RealScalar(-0.00481527332780311), + RealScalar(-0.00120454379482761), + RealScalar(-3.01181303795779e-04), + RealScalar(-7.52981608554592e-05), + RealScalar(-1.88247173988574e-05), + RealScalar(-4.70619042382852e-06), + RealScalar(-1.17654829809007e-06), + RealScalar(-2.94137117780840e-07), + RealScalar(-7.35342821488550e-08), + RealScalar(-1.83835707061916e-08), + RealScalar(-4.59589268710903e-09), + RealScalar(-1.14897317243732e-09), + RealScalar(-2.87243293150586e-10), + RealScalar(-7.18108232902250e-11), + RealScalar(-1.79527058227174e-11), + 
RealScalar(-4.48817645568941e-12), + RealScalar(-1.12204411392298e-12), + RealScalar(-2.80511028480785e-13), + RealScalar(-7.01277571201985e-14), + RealScalar(-1.75319392800498e-14), + RealScalar(-4.38298482001247e-15), + RealScalar(-1.09574620500312e-15), + RealScalar(-2.73936551250781e-16), + RealScalar(-6.84841378126949e-17), + RealScalar(-1.71210344531737e-17), + RealScalar(-4.28025861329343e-18)}; + + // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i)); + const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = {RealScalar(0.0), + RealScalar(0.0), + RealScalar(-1.00000000000000e+00), + RealScalar(-7.07106781186547e-01), + RealScalar(-3.82683432365090e-01), + RealScalar(-1.95090322016128e-01), + RealScalar(-9.80171403295606e-02), + RealScalar(-4.90676743274180e-02), + RealScalar(-2.45412285229123e-02), + RealScalar(-1.22715382857199e-02), + RealScalar(-6.13588464915448e-03), + RealScalar(-3.06795676296598e-03), + RealScalar(-1.53398018628477e-03), + RealScalar(-7.66990318742704e-04), + RealScalar(-3.83495187571396e-04), + RealScalar(-1.91747597310703e-04), + RealScalar(-9.58737990959773e-05), + RealScalar(-4.79368996030669e-05), + RealScalar(-2.39684498084182e-05), + RealScalar(-1.19842249050697e-05), + RealScalar(-5.99211245264243e-06), + RealScalar(-2.99605622633466e-06), + RealScalar(-1.49802811316901e-06), + RealScalar(-7.49014056584716e-07), + RealScalar(-3.74507028292384e-07), + RealScalar(-1.87253514146195e-07), + RealScalar(-9.36267570730981e-08), + RealScalar(-4.68133785365491e-08), + RealScalar(-2.34066892682746e-08), + RealScalar(-1.17033446341373e-08), + RealScalar(-5.85167231706864e-09), + RealScalar(-2.92583615853432e-09)}; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h new file mode 100644 index 00000000000..7f229cb063d --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -0,0 +1,226 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H +#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorFixedSize + * \ingroup CXX11_Tensor_Module + * + * \brief The fixed sized version of the tensor class. + * + * The fixed sized equivalent of + * Eigen::Tensor t(3, 5, 7); + * is + * Eigen::TensorFixedSize> t; + */ + +template +class TensorFixedSize : public TensorBase > { + public: + typedef TensorFixedSize Self; + typedef TensorBase > Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef Scalar_ Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + static constexpr int Options = Options_; + static constexpr int Layout = Options_ & RowMajor ? 
RowMajor : ColMajor; + + enum { + IsAligned = bool(EIGEN_MAX_ALIGN_BYTES > 0), + PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = true, + RawAccess = true + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + typedef Dimensions_ Dimensions; + static constexpr std::size_t NumIndices = Dimensions::count; + + protected: + TensorStorage m_storage; + + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar* data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array{{firstIndex, otherIndices...}}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array{{firstIndex, otherIndices...}}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array{{firstIndex, otherIndices...}}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const { + eigen_assert(checkIndexRange(indices)); + return coeff(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array{{firstIndex, otherIndices...}}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) { + eigen_assert(checkIndexRange(indices)); + return coeffRef(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeffRef(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize() : m_storage() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(const Self& other) : Base(other), m_storage(other.m_storage) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other) : m_storage(other.m_storage) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase& other) { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase& other) { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. 
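A minimal usage sketch, assuming the usual unsupported Tensor include path: all dimensions live in the type, so storage is inline (no heap allocation) and passing the wrong number of indices to operator() fails at compile time.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Rank-3 tensor whose 3x5x7 shape is a compile-time constant.
  Eigen::TensorFixedSize<float, Eigen::Sizes<3, 5, 7>> t;
  t.setZero();
  t(1, 2, 3) = 4.0f;  // exactly three indices; the rank is checked statically
  // Reductions yield rank-0 expressions; evaluate one and read the scalar.
  Eigen::Tensor<float, 0> s = t.sum();
  return s() == 4.0f ? 0 : 1;
}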
+ EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(TensorFixedSize) + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool checkIndexRange(const array& /*indices*/) const { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::lesser_op; + using internal::logical_and_op; + + return true; + // check whether the indices are all >= 0 + /* array_apply_and_reduce(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce(indices, m_storage.dimensions());*/ + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const { + if (Options & RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h new file mode 100644 index 00000000000..d0fbfb38a89 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -0,0 +1,233 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template +struct traits> { + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
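This op is what the .eval() method on tensor expressions expands to. A short sketch of the typical use, forcing a subexpression into a buffer before an op that reads its operand repeatedly:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(8, 8);
  a.setRandom();
  // Without eval(), every coefficient access made by the broadcast would
  // recompute the exponential; the forced-eval node materializes exp(a)
  // once and the broadcast then reads the cached buffer.
  Eigen::array<Eigen::Index, 2> bcast{{2, 2}};
  Eigen::Tensor<float, 2> c = a.exp().eval().broadcast(bcast);
  return c.dimension(0) == 16 ? 0 : 1;
}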
+ typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; + + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorForcedEvalOp& type; +}; + +template +struct nested, 1, typename eval>::type> { + typedef TensorForcedEvalOp type; +}; + +} // end namespace internal + +template +class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + +namespace internal { +template +struct non_integral_type_placement_new { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index numValues, StorageType m_buffer) { + // Initialize non-trivially constructible types. + if (!internal::is_arithmetic::value) { + for (Index i = 0; i < numValues; ++i) new (m_buffer + i) CoeffReturnType(); + } + } +}; + +// SYCL does not support non-integral types +// having new (m_buffer + i) CoeffReturnType() causes the following compiler error for SYCL Devices +// no matching function for call to 'operator new' +template +struct non_integral_type_placement_new { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index, StorageType) {} +}; +} // end namespace internal + +template +class DeviceTempPointerHolder { + public: + DeviceTempPointerHolder(const Device& device, size_t size) + : device_(device), size_(size), ptr_(device.allocate_temp(size)) {} + + ~DeviceTempPointerHolder() { + device_.deallocate_temp(ptr_); + size_ = 0; + ptr_ = nullptr; + } + + void* ptr() { return ptr_; } + + private: + Device device_; + size_t size_; + void* ptr_; +}; + +template +struct TensorEvaluator, Device> { + typedef const internal::remove_all_t ArgType; + typedef TensorForcedEvalOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = true, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic::value, + PreferBlockAccess = false, + RawAccess = true + }; + + static constexpr int Layout = TensorEvaluator::Layout; + static constexpr int NumDims = internal::traits::NumDimensions; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef 
internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), + m_op(op.expression()), + m_device(device), + m_buffer_holder(nullptr), + m_buffer(nullptr) {} + + ~TensorEvaluator() { cleanup(); } + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + const Index numValues = internal::array_prod(m_impl.dimensions()); + m_buffer_holder = std::make_shared>(m_device, numValues * sizeof(CoeffReturnType)); + m_buffer = static_cast(m_buffer_holder->ptr()); + + internal::non_integral_type_placement_new()(numValues, m_buffer); + + typedef TensorEvalToOp> EvalTo; + EvalTo evalToTmp(m_device.get(m_buffer), m_op); + + internal::TensorExecutor, + /*Vectorizable=*/internal::IsVectorizable::value, + /*Tiling=*/internal::IsTileable::value>::run(evalToTmp, m_device); + + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + const Index numValues = internal::array_prod(m_impl.dimensions()); + m_buffer_holder = std::make_shared>(m_device, numValues * sizeof(CoeffReturnType)); + m_buffer = static_cast(m_buffer_holder->ptr()); + + typedef TensorEvalToOp> EvalTo; + EvalTo evalToTmp(m_device.get(m_buffer), m_op); + + auto on_done = std::bind([](EvalSubExprsCallback done_) { done_(true); }, std::move(done)); + internal::TensorAsyncExecutor< + const EvalTo, std::remove_const_t, decltype(on_done), + /*Vectorizable=*/internal::IsVectorizable::value, + /*Tiling=*/internal::IsTileable::value>::runAsync(evalToTmp, m_device, + std::move(on_done)); + } +#endif + + EIGEN_STRONG_INLINE void cleanup() { + m_buffer_holder = nullptr; + m_buffer = nullptr; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_buffer[index]; } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt(m_buffer + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + eigen_assert(m_buffer != nullptr); + return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EvaluatorPointerType data() const { return m_buffer; } + + private: + TensorEvaluator m_impl; + const ArgType m_op; + const Device EIGEN_DEVICE_REF m_device; + std::shared_ptr> m_buffer_holder; + EvaluatorPointerType m_buffer; // Cached copy of the value stored in m_buffer_holder. 
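The buffer management above is a small RAII pattern: a holder owns the device allocation and evaluator copies share it through shared_ptr, so the memory is returned exactly once, when the last copy goes away. A device-agnostic sketch; Device here is a hypothetical stand-in with Eigen-style temp-allocation hooks, not Eigen's real device classes:

#include <cstdlib>
#include <memory>

struct Device {
  void* allocate_temp(std::size_t bytes) const { return std::malloc(bytes); }
  void deallocate_temp(void* p) const { std::free(p); }
};

class TempBufferHolder {
 public:
  TempBufferHolder(const Device& device, std::size_t bytes)
      : device_(device), ptr_(device.allocate_temp(bytes)) {}
  ~TempBufferHolder() { device_.deallocate_temp(ptr_); }
  TempBufferHolder(const TempBufferHolder&) = delete;  // ownership is unique...
  TempBufferHolder& operator=(const TempBufferHolder&) = delete;
  void* ptr() const { return ptr_; }

 private:
  Device device_;
  void* ptr_;
};

int main() {
  Device d;
  // ...but the holder itself is shared, exactly as in the evaluator above.
  auto held = std::make_shared<TempBufferHolder>(d, 1024);
  auto copy = held;
  held.reset();  // the buffer stays alive through `copy`
  return copy->ptr() ? 0 : 1;
}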
+}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h new file mode 100644 index 00000000000..49c20a4e285 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -0,0 +1,215 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +// MakePointer class is used as a container of the address space of the pointer +// on the host and on the device. From the host side it generates the T* pointer +// and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to +// T* m_data on the host. It is always called on the device. +// Specialisation of MakePointer class for creating the sycl buffer with +// map_allocator. +template +struct MakePointer { + typedef T* Type; + typedef const T* ConstType; +}; + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* constCast(const T* data) { + return const_cast(data); +} + +// The StorageMemory class is a container of the device specific pointer +// used for referring to a Pointer on TensorEvaluator class. While the TensorExpression +// is a device-agnostic type and need MakePointer class for type conversion, +// the TensorEvaluator class can be specialized for a device, hence it is possible +// to construct different types of temporary storage memory in TensorEvaluator +// for different devices by specializing the following StorageMemory class. 
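The device hook described in the comment above is plain template specialization. A sketch with hypothetical names of how a device substitutes its own pointer representation:

// Generic case: evaluator storage is a raw host pointer.
template <typename T, typename Device>
struct StorageFor {
  typedef T* Type;
};

struct FancyDevice {};  // hypothetical device tag

// Specialization: storage becomes a device-specific handle instead.
template <typename T>
struct StorageFor<T, FancyDevice> {
  struct Handle {
    T* raw;  // e.g. wrapped buffer/accessor state would live here
  };
  typedef Handle Type;
};

// An evaluator parameterized on FancyDevice would then declare its buffer
// as StorageFor<float, FancyDevice>::Type rather than float*.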
+template +struct StorageMemory : MakePointer {}; + +namespace internal { +template +struct Pointer_type_promotion { + static const bool val = false; +}; +template +struct Pointer_type_promotion { + static const bool val = true; +}; +template +struct TypeConversion { + typedef A* type; +}; +} // namespace internal + +template class MakePointer_ = MakePointer> +class TensorMap; +template +class Tensor; +template +class TensorFixedSize; +template +class TensorRef; +template +class TensorBase; + +template +class TensorCwiseNullaryOp; +template +class TensorCwiseUnaryOp; +template +class TensorCwiseBinaryOp; +template +class TensorCwiseTernaryOp; +template +class TensorSelectOp; +template class MakePointer_ = MakePointer> +class TensorReductionOp; +template +class TensorIndexPairOp; +template +class TensorPairReducerOp; +template +class TensorConcatenationOp; +template +class TensorContractionOp; +template +class TensorConversionOp; +template +class TensorConvolutionOp; +template +class TensorFFTOp; +template +class TensorPatchOp; +template +class TensorImagePatchOp; +template +class TensorVolumePatchOp; +template +class TensorBroadcastingOp; +template +class TensorChippingOp; +template +class TensorReshapingOp; +template +class TensorLayoutSwapOp; +template +class TensorSlicingOp; +template +class TensorReverseOp; +template +class TensorRollOp; +template +class TensorPaddingOp; +template +class TensorShufflingOp; +template +class TensorStridingOp; +template +class TensorStridingSlicingOp; +template +class TensorInflationOp; +template +class TensorGeneratorOp; +template +class TensorAssignOp; +template +class TensorScanOp; +template +class TensorTraceOp; + +template +class TensorCustomUnaryOp; +template +class TensorCustomBinaryOp; + +template class MakePointer_ = MakePointer> +class TensorEvalToOp; +template +class TensorForcedEvalOp; + +template +class TensorDevice; +template +class TensorAsyncDevice; +template +struct TensorEvaluator; + +struct NoOpOutputKernel; + +struct DefaultDevice; +struct ThreadPoolDevice; +struct GpuDevice; +struct SyclDevice; + +#ifdef EIGEN_USE_SYCL +namespace TensorSycl { +namespace internal { +template +class GenericNondeterministicReducer; +} +} // namespace TensorSycl +#endif + +enum FFTResultType { RealPart = 0, ImagPart = 1, BothParts = 2 }; + +enum FFTDirection { FFT_FORWARD = 0, FFT_REVERSE = 1 }; + +namespace internal { + +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess; +}; + +template +struct IsVectorizable { + static const bool value = + TensorEvaluator::PacketAccess && TensorEvaluator::IsAligned; +}; + +// Tiled evaluation strategy. +enum TiledEvaluation { + Off = 0, // tiled evaluation is not supported + On = 1, // still work in progress (see TensorBlock.h) +}; + +template +struct IsTileable { + // Check that block evaluation is supported and it's a preferred option (at + // least one sub-expression has much faster block evaluation, e.g. + // broadcasting). + static constexpr bool BlockAccess = + TensorEvaluator::BlockAccess && TensorEvaluator::PreferBlockAccess; + + static const TiledEvaluation value = BlockAccess ? 
TiledEvaluation::On : TiledEvaluation::Off; +}; + +template ::value, + TiledEvaluation Tiling = IsTileable::value> +class TensorExecutor; + +template ::value, + TiledEvaluation Tiling = IsTileable::value> +class TensorAsyncExecutor; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h new file mode 100644 index 00000000000..7a87594c525 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -0,0 +1,422 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +/** \internal + * \brief Template functor to compute the modulo between an array and a scalar. + */ +template +struct scalar_mod_op { + EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const { return a % m_divisor; } + const Scalar m_divisor; +}; +template +struct functor_traits > { + enum { Cost = scalar_div_cost::value, PacketAccess = false }; +}; + +/** \internal + * \brief Template functor to compute the modulo between 2 arrays. + */ +template +struct scalar_mod2_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const { return a % b; } +}; +template +struct functor_traits > { + enum { Cost = scalar_div_cost::value, PacketAccess = false }; +}; + +template +struct scalar_fmod_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const { + return numext::fmod(a, b); + } +}; +template +struct functor_traits > { + enum { + Cost = 13, // Reciprocal throughput of FPREM on Haswell. 
+ PacketAccess = false + }; +}; + +template +struct reducer_traits { + enum { Cost = 1, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true }; +}; + +// Standard reduction functors +template +struct SumReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + internal::scalar_sum_op sum_op; + *accum = sum_op(*accum, t); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = padd(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(0); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_sum_op sum_op; + return sum_op(saccum, predux(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasAdd, + IsStateful = false, + IsExactlyAssociative = NumTraits::IsInteger + }; +}; + +template +struct MeanReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MeanReducer() : scalarCount_(0), packetCount_(0) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + internal::scalar_sum_op sum_op; + *accum = sum_op(*accum, t); + scalarCount_++; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { + (*accum) = padd(*accum, p); + packetCount_++; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(0); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + internal::scalar_quotient_op quotient_op; + return quotient_op(accum, T(scalarCount_)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return pdiv(vaccum, pset1(T(packetCount_))); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_sum_op sum_op; + internal::scalar_quotient_op quotient_op; + return quotient_op(sum_op(saccum, predux(vaccum)), T(scalarCount_ + packetCount_ * unpacket_traits::size)); + } + + protected: + DenseIndex scalarCount_; + DenseIndex packetCount_; +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasAdd && PacketType::HasDiv && !NumTraits::IsInteger, + IsStateful = true, + IsExactlyAssociative = NumTraits::IsInteger + }; +}; + +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { return Eigen::NumTraits::lowest(); } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { return -Eigen::NumTraits::infinity(); } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { return Eigen::NumTraits::highest(); } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { return 
Eigen::NumTraits::infinity(); } +}; + +template +struct MaxReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + scalar_max_op op; + *accum = op(t, *accum); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + scalar_max_op op; + (*accum) = op.packetOp(*accum, p); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return MinMaxBottomValue::IsInteger>::bottom_value(); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + scalar_max_op op; + return op(saccum, op.predux(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasMax, + IsStateful = false, + IsExactlyAssociative = (NaNPropagation != PropagateFast) + }; +}; + +template +struct MinReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + scalar_min_op op; + *accum = op(t, *accum); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + scalar_min_op op; + (*accum) = op.packetOp(*accum, p); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return MinMaxBottomValue::IsInteger>::bottom_value(); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + scalar_min_op op; + return op(saccum, op.predux(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasMin, + IsStateful = false, + IsExactlyAssociative = (NaNPropagation != PropagateFast) + }; +}; + +template +struct ProdReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + internal::scalar_product_op prod_op; + (*accum) = prod_op(*accum, t); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmul(*accum, p); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(1); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_product_op prod_op; + return prod_op(saccum, predux_mul(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::MulCost, + PacketAccess = PacketType::HasMul, + IsStateful = false, + IsExactlyAssociative 
= true + }; +}; + +struct AndReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum && t; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { return accum; } +}; + +template +struct reducer_traits { + enum { Cost = 1, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true }; +}; + +struct OrReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum || t; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { return false; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { return accum; } +}; + +template +struct reducer_traits { + enum { Cost = 1, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true }; +}; + +// Argmin/Argmax reducers. Returns the first occurrence if multiple locations +// contain the same min/max value. +template +struct ArgMaxPairReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t.second < accum->second) { + return; + } else if (t.second > accum->second || accum->first > t.first) { + *accum = t; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return T(0, NumTraits::lowest()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { return accum; } +}; + +template +struct reducer_traits, Device> { + enum { Cost = NumTraits::AddCost, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true }; +}; + +template +struct ArgMinPairReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { + if (t.second > accum->second) { + return; + } else if (t.second < accum->second || accum->first > t.first) { + *accum = t; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return T(0, NumTraits::highest()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { return accum; } +}; + +template +struct reducer_traits, Device> { + enum { Cost = NumTraits::AddCost, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true }; +}; + +template +class GaussianGenerator { + public: + static constexpr bool PacketAccess = false; + + EIGEN_DEVICE_FUNC GaussianGenerator(const array& means, const array& std_devs) + : m_means(means) { + EIGEN_UNROLL_LOOP + for (size_t i = 0; i < NumDims; ++i) { + m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2; + } + } + + EIGEN_DEVICE_FUNC T operator()(const array& coordinates) const { + T tmp = T(0); + EIGEN_UNROLL_LOOP + for (size_t i = 0; i < NumDims; ++i) { + T offset = coordinates[i] - m_means[i]; + tmp += offset * offset / m_two_sigmas[i]; + } + return numext::exp(-tmp); + } + + private: + array m_means; + array m_two_sigmas; +}; + +template +struct functor_traits > { + enum { + Cost = NumDims * + (2 * NumTraits::AddCost + NumTraits::MulCost + functor_traits >::Cost) + + functor_traits >::Cost, + PacketAccess = GaussianGenerator::PacketAccess + }; +}; + +template +struct scalar_clamp_op { + EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { + return numext::mini(numext::maxi(x, m_min), m_max); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& x) const { + return internal::pmin(internal::pmax(x, pset1(m_min)), 
pset1(m_max)); + } + const Scalar m_min; + const Scalar m_max; +}; +template +struct functor_traits > { + enum { + Cost = 2 * NumTraits::AddCost, + PacketAccess = (packet_traits::HasMin && packet_traits::HasMax) + }; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h new file mode 100644 index 00000000000..b038eaf787a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -0,0 +1,271 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorGeneratorOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor generator class. + * + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorGeneratorOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorGeneratorOp type; +}; + +} // end namespace internal + +template +class TensorGeneratorOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator) + : m_xpr(expr), m_generator(generator) {} + + EIGEN_DEVICE_FUNC const Generator& generator() const { return m_generator; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Generator m_generator; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorGeneratorOp XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator::Dimensions Dimensions; + static constexpr int NumDims = internal::array_size::value; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = 
(PacketType::size > 1), + BlockAccess = true, + PreferBlockAccess = true, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef internal::TensorIntDivisor IndexDivisor; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), m_generator(op.generator()) { + TensorEvaluator argImpl(op.expression(), device); + m_dimensions = argImpl.dimensions(); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + EIGEN_UNROLL_LOOP + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); + } + } else { + m_strides[NumDims - 1] = 1; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { return true; } + EIGEN_STRONG_INLINE void cleanup() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + array coords; + extract_coordinates(index, coords); + return m_generator(coords); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + const int packetSize = PacketType::size; + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX std::remove_const_t values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.firstLevelCacheSize(); + // TODO(ezhulenev): Generator should have a cost. + return internal::TensorBlockResourceRequirements::skewed(target_size); + } + + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + static const bool is_col_major = static_cast(Layout) == static_cast(ColMajor); + + // Compute spatial coordinates for the first block element. + array coords; + extract_coordinates(desc.offset(), coords); + array initial_coords = coords; + + // Offset in the output block buffer. + Index offset = 0; + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims; ++i) { + const int dim = is_col_major ? i : NumDims - 1 - i; + it[i].size = desc.dimension(dim); + it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride); + it[i].span = it[i].stride * (it[i].size - 1); + it[i].count = 0; + } + eigen_assert(it[0].stride == 1); + + // Prepare storage for the materialized generator result. 
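As a usage sketch of the API this evaluator backs: a generator is any functor that maps an element's coordinate array to its value, attached with TensorBase::generate; the shape comes from the expression it is applied to.

#include <unsupported/Eigen/CXX11/Tensor>

struct RowColId {  // the value encodes the coordinates that produced it
  float operator()(const Eigen::array<Eigen::Index, 2>& coords) const {
    return static_cast<float>(coords[0] * 100 + coords[1]);
  }
};

int main() {
  Eigen::Tensor<float, 2> shape_source(4, 5);  // only its 4x5 shape is used
  shape_source.setZero();
  Eigen::Tensor<float, 2> out = shape_source.generate(RowColId());
  return out(2, 3) == 203.0f ? 0 : 1;
}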
+ const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + + CoeffReturnType* block_buffer = block_storage.data(); + + static const int packet_size = PacketType::size; + + static const int inner_dim = is_col_major ? 0 : NumDims - 1; + const Index inner_dim_size = it[0].size; + const Index inner_dim_vectorized = inner_dim_size - packet_size; + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + Index i = 0; + // Generate data for the vectorized part of the inner-most dimension. + for (; i <= inner_dim_vectorized; i += packet_size) { + for (Index j = 0; j < packet_size; ++j) { + array j_coords = coords; // Break loop dependence. + j_coords[inner_dim] += j; + *(block_buffer + offset + i + j) = m_generator(j_coords); + } + coords[inner_dim] += packet_size; + } + // Finalize non-vectorized part of the inner-most dimension. + for (; i < inner_dim_size; ++i) { + *(block_buffer + offset + i) = m_generator(coords); + coords[inner_dim]++; + } + coords[inner_dim] = initial_coords[inner_dim]; + + // For the 1d tensor we need to generate only one inner-most dimension. + if (NumDims == 1) break; + + // Update offset. + for (i = 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + offset += it[i].stride; + coords[is_col_major ? i : NumDims - 1 - i]++; + break; + } + if (i != NumDims - 1) it[i].count = 0; + coords[is_col_major ? i : NumDims - 1 - i] = initial_coords[is_col_major ? i : NumDims - 1 - i]; + offset -= it[i].span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + // TODO(rmlarsen): This is just a placeholder. Define interface to make + // generators return their cost. + return TensorOpCost(0, 0, TensorOpCost::AddCost() + TensorOpCost::MulCost()); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_coordinates(Index index, array& coords) const { + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[NumDims - 1] = index; + } + } + + const Device EIGEN_DEVICE_REF m_device; + Dimensions m_dimensions; + array m_strides; + array m_fast_strides; + Generator m_generator; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h new file mode 100644 index 00000000000..6a1240cfacd --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h @@ -0,0 +1,36 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Eugene Brevdo +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
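A minimal usage sketch for the generator op above (assuming only the public TensorBase::generate() wrapper, which constructs a TensorGeneratorOp; the CoordSum functor is illustrative, not part of the patch). Each output coefficient is produced by calling the generator with that coefficient's multi-dimensional coordinates, exactly as coeff() and block() do in the evaluator:

    #include <unsupported/Eigen/CXX11/Tensor>

    // Generator functor: invoked with the coordinates of each coefficient.
    struct CoordSum {
      float operator()(const Eigen::array<Eigen::Index, 2>& coords) const {
        return static_cast<float>(coords[0] + 10 * coords[1]);
      }
    };

    int main() {
      Eigen::Tensor<float, 2> t(3, 4);
      // Evaluating the expression calls CoordSum once per coefficient.
      Eigen::Tensor<float, 2> g = t.generate(CoordSum());
      return 0;
    }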
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given tensors. + * + * This function computes the regularized incomplete beta function (integral). + * + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseTernaryOp, + const ADerived, const BDerived, const XDerived> +betainc(const Eigen::TensorBase& a, + const Eigen::TensorBase& b, + const Eigen::TensorBase& x) { + return TensorCwiseTernaryOp, const ADerived, const BDerived, + const XDerived>(a.derived(), b.derived(), x.derived(), + internal::scalar_betainc_op()); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h new file mode 100644 index 00000000000..3073272a24f --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h @@ -0,0 +1,101 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2018 Deven Desai +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) +#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H + +// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design +// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU, but +// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler +// When compiling such files, gcc will end up trying to pick up the CUDA headers by +// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU) +// This will obviously not work when trying to compile tensorflow on a system with no CUDA +// To work around this issue for HIP systems (and leave the default behaviour intact), the +// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and +// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is +// defined. 
In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well + +#if defined(EIGEN_USE_HIP) + +#define gpuStream_t hipStream_t +#define gpuDeviceProp_t hipDeviceProp_t +#define gpuError_t hipError_t +#define gpuSuccess hipSuccess +#define gpuErrorNotReady hipErrorNotReady +#define gpuGetDeviceCount hipGetDeviceCount +#define gpuGetLastError hipGetLastError +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorName hipGetErrorName +#define gpuGetErrorString hipGetErrorString +#define gpuGetDeviceProperties hipGetDeviceProperties +#define gpuStreamDefault hipStreamDefault +#define gpuGetDevice hipGetDevice +#define gpuSetDevice hipSetDevice +#define gpuMalloc hipMalloc +#define gpuFree hipFree +#define gpuMemsetAsync hipMemsetAsync +#define gpuMemset2DAsync hipMemset2DAsync +#define gpuMemcpyAsync hipMemcpyAsync +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuStreamQuery hipStreamQuery +#define gpuSharedMemConfig hipSharedMemConfig +#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig +#define gpuStreamSynchronize hipStreamSynchronize +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuMemcpy hipMemcpy + +#else + +#define gpuStream_t cudaStream_t +#define gpuDeviceProp_t cudaDeviceProp +#define gpuError_t cudaError_t +#define gpuSuccess cudaSuccess +#define gpuErrorNotReady cudaErrorNotReady +#define gpuGetDeviceCount cudaGetDeviceCount +#define gpuGetLastError cudaGetLastError +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorName cudaGetErrorName +#define gpuGetErrorString cudaGetErrorString +#define gpuGetDeviceProperties cudaGetDeviceProperties +#define gpuStreamDefault cudaStreamDefault +#define gpuGetDevice cudaGetDevice +#define gpuSetDevice cudaSetDevice +#define gpuMalloc cudaMalloc +#define gpuFree cudaFree +#define gpuMemsetAsync cudaMemsetAsync +#define gpuMemset2DAsync cudaMemset2DAsync +#define gpuMemcpyAsync cudaMemcpyAsync +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuStreamQuery cudaStreamQuery +#define gpuSharedMemConfig cudaSharedMemConfig +#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig +#define gpuStreamSynchronize cudaStreamSynchronize +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuMemcpy cudaMemcpy + +#endif + +// gpu_assert can be overridden +#ifndef gpu_assert + +#if defined(EIGEN_HIP_DEVICE_COMPILE) +// HIPCC do not support the use of assert on the GPU side. +#define gpu_assert(COND) +#else +#define gpu_assert(COND) eigen_assert(COND) +#endif + +#endif // gpu_assert + +#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h new file mode 100644 index 00000000000..509bcee0b87 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h @@ -0,0 +1,45 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2018 Deven Desai +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) + +#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + +#undef gpuStream_t +#undef gpuDeviceProp_t +#undef gpuError_t +#undef gpuSuccess +#undef gpuErrorNotReady +#undef gpuGetDeviceCount +#undef gpuGetErrorString +#undef gpuGetDeviceProperties +#undef gpuStreamDefault +#undef gpuGetDevice +#undef gpuSetDevice +#undef gpuMalloc +#undef gpuFree +#undef gpuMemsetAsync +#undef gpuMemset2DAsync +#undef gpuMemcpyAsync +#undef gpuMemcpyDeviceToDevice +#undef gpuMemcpyDeviceToHost +#undef gpuMemcpyHostToDevice +#undef gpuStreamQuery +#undef gpuSharedMemConfig +#undef gpuDeviceSetSharedMemConfig +#undef gpuStreamSynchronize +#undef gpuDeviceSynchronize +#undef gpuMemcpy + +#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + +#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H + +#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h new file mode 100644 index 00000000000..0bdb1ab7985 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -0,0 +1,413 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
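A minimal sketch of how the define/undefine pair above is intended to be used inside an EIGEN_USE_GPU translation unit (allocate_device_buffer is an illustrative helper, not Eigen code; only the gpu* aliases and gpu_assert come from the headers above):

    #include <cstddef>
    #include "TensorGpuHipCudaDefines.h"

    // Allocates a device buffer through the portability aliases: this compiles
    // to cudaMalloc/cudaSuccess on CUDA builds and to hipMalloc/hipSuccess
    // when EIGEN_USE_HIP is defined.
    static float* allocate_device_buffer(std::size_t n) {
      float* ptr = nullptr;
      gpuError_t err = gpuMalloc(reinterpret_cast<void**>(&ptr), n * sizeof(float));
      gpu_assert(err == gpuSuccess);
      return ptr;
    }

    // Restore the macro namespace once the aliases are no longer needed.
    #include "TensorGpuHipCudaUndefines.h"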
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H +#define EIGEN_CXX11_TENSOR_TENSOR_IO_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +struct TensorIOFormat; + +namespace internal { +template +struct TensorPrinter; +} + +template +struct TensorIOFormatBase { + using Derived = Derived_; + TensorIOFormatBase(const std::vector& separator, const std::vector& prefix, + const std::vector& suffix, int precision = StreamPrecision, int flags = 0, + const std::string& tenPrefix = "", const std::string& tenSuffix = "", const char fill = ' ') + : tenPrefix(tenPrefix), + tenSuffix(tenSuffix), + prefix(prefix), + suffix(suffix), + separator(separator), + fill(fill), + precision(precision), + flags(flags) { + init_spacer(); + } + + void init_spacer() { + if ((flags & DontAlignCols)) return; + spacer.resize(prefix.size()); + spacer[0] = ""; + int i = int(tenPrefix.length()) - 1; + while (i >= 0 && tenPrefix[i] != '\n') { + spacer[0] += ' '; + i--; + } + + for (std::size_t k = 1; k < prefix.size(); k++) { + int j = int(prefix[k].length()) - 1; + while (j >= 0 && prefix[k][j] != '\n') { + spacer[k] += ' '; + j--; + } + } + } + + std::string tenPrefix; + std::string tenSuffix; + std::vector prefix; + std::vector suffix; + std::vector separator; + char fill; + int precision; + int flags; + std::vector spacer{}; +}; + +struct TensorIOFormatNumpy : public TensorIOFormatBase { + using Base = TensorIOFormatBase; + TensorIOFormatNumpy() + : Base(/*separator=*/{" ", "\n"}, /*prefix=*/{"", "["}, /*suffix=*/{"", "]"}, /*precision=*/StreamPrecision, + /*flags=*/0, /*tenPrefix=*/"[", /*tenSuffix=*/"]") {} +}; + +struct TensorIOFormatNative : public TensorIOFormatBase { + using Base = TensorIOFormatBase; + TensorIOFormatNative() + : Base(/*separator=*/{", ", ",\n", "\n"}, /*prefix=*/{"", "{"}, /*suffix=*/{"", "}"}, + /*precision=*/StreamPrecision, /*flags=*/0, /*tenPrefix=*/"{", /*tenSuffix=*/"}") {} +}; + +struct TensorIOFormatPlain : public TensorIOFormatBase { + using Base = TensorIOFormatBase; + TensorIOFormatPlain() + : Base(/*separator=*/{" ", "\n", "\n", ""}, /*prefix=*/{""}, /*suffix=*/{""}, /*precision=*/StreamPrecision, + /*flags=*/0, /*tenPrefix=*/"", /*tenSuffix=*/"") {} +}; + +struct TensorIOFormatLegacy : public TensorIOFormatBase { + using Base = TensorIOFormatBase; + TensorIOFormatLegacy() + : Base(/*separator=*/{", ", "\n"}, /*prefix=*/{"", "["}, /*suffix=*/{"", "]"}, /*precision=*/StreamPrecision, + /*flags=*/0, /*tenPrefix=*/"", /*tenSuffix=*/"") {} +}; + +struct TensorIOFormat : public TensorIOFormatBase { + using Base = TensorIOFormatBase; + TensorIOFormat(const std::vector& separator, const std::vector& prefix, + const std::vector& suffix, int precision = StreamPrecision, int flags = 0, + const std::string& tenPrefix = "", const std::string& tenSuffix = "", const char fill = ' ') + : Base(separator, prefix, suffix, precision, flags, tenPrefix, tenSuffix, fill) {} + + static inline const TensorIOFormatNumpy Numpy() { return TensorIOFormatNumpy{}; } + + static inline const TensorIOFormatPlain Plain() { return TensorIOFormatPlain{}; } + + static inline const TensorIOFormatNative Native() { return TensorIOFormatNative{}; } + + static inline const TensorIOFormatLegacy Legacy() { return TensorIOFormatLegacy{}; } +}; + +template +class TensorWithFormat; +// specialize for Layout=ColMajor, Layout=RowMajor and rank=0. 
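+// Usage sketch (editor's note, not upstream text): a format is selected via
+// TensorBase::format(), which wraps the tensor in one of the TensorWithFormat
+// specializations below:
+//   Eigen::Tensor<int, 2> t(2, 3);
+//   t.setValues({{1, 2, 3}, {4, 5, 6}});
+//   std::cout << t;                                         // Plain (default)
+//   std::cout << t.format(Eigen::TensorIOFormat::Numpy());  // [[1 2 3]
+//                                                           //  [4 5 6]]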
+template +class TensorWithFormat { + public: + TensorWithFormat(const T& tensor, const Format& format) : t_tensor(tensor), t_format(format) {} + + friend std::ostream& operator<<(std::ostream& os, const TensorWithFormat& wf) { + // Evaluate the expression if needed + typedef TensorEvaluator, DefaultDevice> Evaluator; + TensorForcedEvalOp eval = wf.t_tensor.eval(); + Evaluator tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + internal::TensorPrinter::run(os, tensor, wf.t_format); + // Cleanup. + tensor.cleanup(); + return os; + } + + protected: + T t_tensor; + Format t_format; +}; + +template +class TensorWithFormat { + public: + TensorWithFormat(const T& tensor, const Format& format) : t_tensor(tensor), t_format(format) {} + + friend std::ostream& operator<<(std::ostream& os, const TensorWithFormat& wf) { + // Switch to RowMajor storage and print afterwards + typedef typename T::Index IndexType; + std::array shuffle; + std::array id; + std::iota(id.begin(), id.end(), IndexType(0)); + std::copy(id.begin(), id.end(), shuffle.rbegin()); + auto tensor_row_major = wf.t_tensor.swap_layout().shuffle(shuffle); + + // Evaluate the expression if needed + typedef TensorEvaluator, DefaultDevice> Evaluator; + TensorForcedEvalOp eval = tensor_row_major.eval(); + Evaluator tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + internal::TensorPrinter::run(os, tensor, wf.t_format); + // Cleanup. + tensor.cleanup(); + return os; + } + + protected: + T t_tensor; + Format t_format; +}; + +template +class TensorWithFormat { + public: + TensorWithFormat(const T& tensor, const Format& format) : t_tensor(tensor), t_format(format) {} + + friend std::ostream& operator<<(std::ostream& os, const TensorWithFormat& wf) { + // Evaluate the expression if needed + typedef TensorEvaluator, DefaultDevice> Evaluator; + TensorForcedEvalOp eval = wf.t_tensor.eval(); + Evaluator tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + internal::TensorPrinter::run(os, tensor, wf.t_format); + // Cleanup. + tensor.cleanup(); + return os; + } + + protected: + T t_tensor; + Format t_format; +}; + +namespace internal { + +// Default scalar printer. 
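+// (Editor's note) ScalarPrinter is specialized on the (Scalar, Format) pair:
+// complex scalars render as "a+bj" under the Numpy format and as "{a, b}"
+// under the Native format below, while every other combination falls through
+// to the default operator<< path defined next.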
+template +struct ScalarPrinter { + static void run(std::ostream& stream, const Scalar& scalar, const Format&) { stream << scalar; } +}; + +template +struct ScalarPrinter::IsComplex>> { + static void run(std::ostream& stream, const Scalar& scalar, const TensorIOFormatNumpy&) { + stream << numext::real(scalar) << "+" << numext::imag(scalar) << "j"; + } +}; + +template +struct ScalarPrinter::IsComplex>> { + static void run(std::ostream& stream, const Scalar& scalar, const TensorIOFormatNative&) { + stream << "{" << numext::real(scalar) << ", " << numext::imag(scalar) << "}"; + } +}; + +template +struct TensorPrinter { + using Scalar = std::remove_const_t; + + static void run(std::ostream& s, const Tensor& tensor, const Format& fmt) { + typedef typename Tensor::Index IndexType; + + eigen_assert(Tensor::Layout == RowMajor); + typedef std::conditional_t::value || is_same::value || + is_same::value || is_same::value, + int, + std::conditional_t>::value || + is_same>::value || + is_same>::value || + is_same>::value, + std::complex, const Scalar&>> + PrintType; + + const IndexType total_size = array_prod(tensor.dimensions()); + + std::streamsize explicit_precision; + if (fmt.precision == StreamPrecision) { + explicit_precision = 0; + } else if (fmt.precision == FullPrecision) { + if (NumTraits::IsInteger) { + explicit_precision = 0; + } else { + explicit_precision = significant_decimals_impl::run(); + } + } else { + explicit_precision = fmt.precision; + } + + std::streamsize old_precision = 0; + if (explicit_precision) old_precision = s.precision(explicit_precision); + + IndexType width = 0; + bool align_cols = !(fmt.flags & DontAlignCols); + if (align_cols) { + // compute the largest width + for (IndexType i = 0; i < total_size; i++) { + std::stringstream sstr; + sstr.copyfmt(s); + ScalarPrinter::run(sstr, static_cast(tensor.data()[i]), fmt); + width = std::max(width, IndexType(sstr.str().length())); + } + } + s << fmt.tenPrefix; + for (IndexType i = 0; i < total_size; i++) { + std::array is_at_end{}; + std::array is_at_begin{}; + + // is the ith element the end of an coeff (always true), of a row, of a matrix, ...? + for (std::size_t k = 0; k < rank; k++) { + if ((i + 1) % (std::accumulate(tensor.dimensions().rbegin(), tensor.dimensions().rbegin() + k, 1, + std::multiplies())) == + 0) { + is_at_end[k] = true; + } + } + + // is the ith element the begin of an coeff (always true), of a row, of a matrix, ...? + for (std::size_t k = 0; k < rank; k++) { + if (i % (std::accumulate(tensor.dimensions().rbegin(), tensor.dimensions().rbegin() + k, 1, + std::multiplies())) == + 0) { + is_at_begin[k] = true; + } + } + + // do we have a line break? + bool is_at_begin_after_newline = false; + for (std::size_t k = 0; k < rank; k++) { + if (is_at_begin[k]) { + std::size_t separator_index = (k < fmt.separator.size()) ? k : fmt.separator.size() - 1; + if (fmt.separator[separator_index].find('\n') != std::string::npos) { + is_at_begin_after_newline = true; + } + } + } + + bool is_at_end_before_newline = false; + for (std::size_t k = 0; k < rank; k++) { + if (is_at_end[k]) { + std::size_t separator_index = (k < fmt.separator.size()) ? k : fmt.separator.size() - 1; + if (fmt.separator[separator_index].find('\n') != std::string::npos) { + is_at_end_before_newline = true; + } + } + } + + std::stringstream suffix, prefix, separator; + for (std::size_t k = 0; k < rank; k++) { + std::size_t suffix_index = (k < fmt.suffix.size()) ? 
k : fmt.suffix.size() - 1; + if (is_at_end[k]) { + suffix << fmt.suffix[suffix_index]; + } + } + for (std::size_t k = 0; k < rank; k++) { + std::size_t separator_index = (k < fmt.separator.size()) ? k : fmt.separator.size() - 1; + if (is_at_end[k] && + (!is_at_end_before_newline || fmt.separator[separator_index].find('\n') != std::string::npos)) { + separator << fmt.separator[separator_index]; + } + } + for (std::size_t k = 0; k < rank; k++) { + std::size_t spacer_index = (k < fmt.spacer.size()) ? k : fmt.spacer.size() - 1; + if (i != 0 && is_at_begin_after_newline && (!is_at_begin[k] || k == 0)) { + prefix << fmt.spacer[spacer_index]; + } + } + for (int k = rank - 1; k >= 0; k--) { + std::size_t prefix_index = (static_cast(k) < fmt.prefix.size()) ? k : fmt.prefix.size() - 1; + if (is_at_begin[k]) { + prefix << fmt.prefix[prefix_index]; + } + } + + s << prefix.str(); + // So we don't mess around with formatting, output scalar to a string stream, and adjust the width/fill manually. + std::stringstream sstr; + sstr.copyfmt(s); + ScalarPrinter::run(sstr, static_cast(tensor.data()[i]), fmt); + std::string scalar_str = sstr.str(); + IndexType scalar_width = scalar_str.length(); + if (width && scalar_width < width) { + std::string filler; + for (IndexType j = scalar_width; j < width; ++j) { + filler.push_back(fmt.fill); + } + s << filler; + } + s << scalar_str; + s << suffix.str(); + if (i < total_size - 1) { + s << separator.str(); + } + } + s << fmt.tenSuffix; + if (explicit_precision) s.precision(old_precision); + } +}; + +template +struct TensorPrinter> { + using Format = TensorIOFormatLegacy; + using Scalar = std::remove_const_t; + + static void run(std::ostream& s, const Tensor& tensor, const Format&) { + typedef typename Tensor::Index IndexType; + // backwards compatibility case: print tensor after reshaping to matrix of size dim(0) x + // (dim(1)*dim(2)*...*dim(rank-1)). 
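+    // For instance (editor's note), a 2x3x4 tensor is printed as the 2x12
+    // matrix obtained by flattening all trailing dimensions into columns.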
+    const IndexType total_size = internal::array_prod(tensor.dimensions());
+    if (total_size > 0) {
+      const IndexType first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
+      Map<const Array<Scalar, Dynamic, Dynamic, Tensor::Layout>> matrix(tensor.data(), first_dim,
+                                                                        total_size / first_dim);
+      s << matrix;
+      return;
+    }
+  }
+};
+
+template <typename Tensor, typename Format>
+struct TensorPrinter<Tensor, 0, Format> {
+  static void run(std::ostream& s, const Tensor& tensor, const Format& fmt) {
+    using Scalar = std::remove_const_t<typename Tensor::Scalar>;
+
+    std::streamsize explicit_precision;
+    if (fmt.precision == StreamPrecision) {
+      explicit_precision = 0;
+    } else if (fmt.precision == FullPrecision) {
+      if (NumTraits<Scalar>::IsInteger) {
+        explicit_precision = 0;
+      } else {
+        explicit_precision = significant_decimals_impl<Scalar>::run();
+      }
+    } else {
+      explicit_precision = fmt.precision;
+    }
+
+    std::streamsize old_precision = 0;
+    if (explicit_precision) old_precision = s.precision(explicit_precision);
+    s << fmt.tenPrefix;
+    ScalarPrinter<Scalar, Format>::run(s, tensor.coeff(0), fmt);
+    s << fmt.tenSuffix;
+    if (explicit_precision) s.precision(old_precision);
+  }
+};
+
+}  // end namespace internal
+template <typename T>
+std::ostream& operator<<(std::ostream& s, const TensorBase<T, ReadOnlyAccessors>& t) {
+  s << t.format(TensorIOFormat::Plain());
+  return s;
+}
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_IO_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
new file mode 100644
index 00000000000..291301a8193
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -0,0 +1,590 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
+#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class TensorImagePatch
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Patch extraction specialized for image processing.
+ * This assumes that the input has at least 3 dimensions, ordered as follows:
+ * 1st dimension: channels (of size d)
+ * 2nd dimension: rows (of size r)
+ * 3rd dimension: columns (of size c)
+ * There can be additional dimensions such as time (for video) or batch (for
+ * bulk processing) after the first 3.
+ * Calling the image patch code with patch_rows and patch_cols is equivalent
+ * to calling the regular patch extraction code with parameters d, patch_rows,
+ * patch_cols, and 1 for all the additional dimensions.
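+ *
+ * Shape sketch (editor's note): with a ColMajor input of dimensions
+ * (d, r, c, b), input.extract_image_patches(patch_rows, patch_cols) yields a
+ * tensor of dimensions (d, patch_rows, patch_cols, outputRows * outputCols, b),
+ * matching the evaluator's dimension setup below.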
+ */ +namespace internal { + +template +struct traits > : public traits { + typedef std::remove_const_t Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions + 1; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorImagePatchOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorImagePatchOp type; +}; + +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Self& self, const Index num_coeff_to_copy, + const Index dst_index, Scalar* dst_data, + const Index src_index) { + const Impl& impl = self.impl(); + for (Index i = 0; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Self& self, const Index num_coeff_to_copy, + const Index dst_index, Scalar* dst_data, + const Index src_index) { + const Impl& impl = self.impl(); + const Index packet_size = internal::unpacket_traits::size; + const Index vectorized_size = (num_coeff_to_copy / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = impl.template packet(src_index + i); + internal::pstoret(dst_data + dst_index + i, p); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchPaddingOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Index num_coeff_to_pad, const Scalar padding_value, + const Index dst_index, Scalar* dst_data) { + const Index packet_size = internal::unpacket_traits::size; + const Packet padded_packet = internal::pset1(padding_value); + const Index vectorized_size = (num_coeff_to_pad / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + internal::pstoret(dst_data + dst_index + i, padded_packet); + } + for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) { + dst_data[dst_index + i] = padding_value; + } + } +}; + +} // end namespace internal + +template +class TensorImagePatchOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, + DenseIndex patch_cols, DenseIndex row_strides, + DenseIndex col_strides, DenseIndex in_row_strides, + DenseIndex in_col_strides, DenseIndex row_inflate_strides, + DenseIndex 
col_inflate_strides, PaddingType padding_type, + Scalar padding_value) + : m_xpr(expr), + m_patch_rows(patch_rows), + m_patch_cols(patch_cols), + m_row_strides(row_strides), + m_col_strides(col_strides), + m_in_row_strides(in_row_strides), + m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), + m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), + m_padding_top(0), + m_padding_bottom(0), + m_padding_left(0), + m_padding_right(0), + m_padding_type(padding_type), + m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, + DenseIndex patch_cols, DenseIndex row_strides, + DenseIndex col_strides, DenseIndex in_row_strides, + DenseIndex in_col_strides, DenseIndex row_inflate_strides, + DenseIndex col_inflate_strides, DenseIndex padding_top, + DenseIndex padding_bottom, DenseIndex padding_left, + DenseIndex padding_right, Scalar padding_value) + : m_xpr(expr), + m_patch_rows(patch_rows), + m_patch_cols(patch_cols), + m_row_strides(row_strides), + m_col_strides(col_strides), + m_in_row_strides(in_row_strides), + m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), + m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), + m_padding_top(padding_top), + m_padding_bottom(padding_bottom), + m_padding_left(padding_left), + m_padding_right(padding_right), + m_padding_type(PADDING_VALID), + m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC DenseIndex in_row_strides() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC DenseIndex in_col_strides() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC bool padding_explicit() const { return m_padding_explicit; } + EIGEN_DEVICE_FUNC DenseIndex padding_top() const { return m_padding_top; } + EIGEN_DEVICE_FUNC DenseIndex padding_bottom() const { return m_padding_bottom; } + EIGEN_DEVICE_FUNC DenseIndex padding_left() const { return m_padding_left; } + EIGEN_DEVICE_FUNC DenseIndex padding_right() const { return m_padding_right; } + EIGEN_DEVICE_FUNC PaddingType padding_type() const { return m_padding_type; } + EIGEN_DEVICE_FUNC Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const DenseIndex m_in_row_strides; + const DenseIndex m_in_col_strides; + const DenseIndex m_row_inflate_strides; + const DenseIndex m_col_inflate_strides; + const bool m_padding_explicit; + const DenseIndex m_padding_top; + const DenseIndex m_padding_bottom; + const DenseIndex m_padding_left; + const DenseIndex m_padding_right; + const PaddingType m_padding_type; + const Scalar m_padding_value; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorImagePatchOp XprType; + typedef typename XprType::Index Index; + static 
constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + static constexpr int NumDims = NumInputDims + 1; + typedef DSizes Dimensions; + typedef std::remove_const_t Scalar; + typedef TensorEvaluator, Device> Self; + typedef TensorEvaluator Impl; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = true, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), m_impl(op.expression(), device) { + EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); + + m_paddingValue = op.padding_value(); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + // Caches a few variables. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputDepth = input_dims[0]; + m_inputRows = input_dims[1]; + m_inputCols = input_dims[2]; + } else { + m_inputDepth = input_dims[NumInputDims - 1]; + m_inputRows = input_dims[NumInputDims - 2]; + m_inputCols = input_dims[NumInputDims - 3]; + } + + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // Input strides and effective input/patch size + m_in_row_strides = op.in_row_strides(); + m_in_col_strides = op.in_col_strides(); + m_row_inflate_strides = op.row_inflate_strides(); + m_col_inflate_strides = op.col_inflate_strides(); + // The "effective" input rows and input cols are the input rows and cols + // after inflating them with zeros. + // For examples, a 2x3 matrix with row_inflate_strides and + // col_inflate_strides of 2 comes from: + // A B C + // D E F + // + // to a matrix is 3 x 5: + // + // A . B . C + // . . . . . + // D . E . 
F + + m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; + m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; + m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); + m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); + + if (op.padding_explicit()) { + m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / + static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / + static_cast(m_col_strides)); + m_rowPaddingTop = op.padding_top(); + m_colPaddingLeft = op.padding_left(); + } else { + // Computing padding from the type + switch (op.padding_type()) { + case PADDING_VALID: + m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = + numext::maxi(0, ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2); + m_colPaddingLeft = + numext::maxi(0, ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2); + break; + case PADDING_SAME: + m_outputRows = numext::ceil(m_input_rows_eff / static_cast(m_row_strides)); + m_outputCols = numext::ceil(m_input_cols_eff / static_cast(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; + // The padding size calculation for PADDING_SAME has been updated to + // be consistent with how TensorFlow extracts its paddings. + m_rowPaddingTop = numext::maxi(0, m_rowPaddingTop); + m_colPaddingLeft = numext::maxi(0, m_colPaddingLeft); + break; + default: + eigen_assert(false && "unexpected padding"); + m_outputCols = 0; // silence the uninitialised warning; + m_outputRows = 0; //// silence the uninitialised warning; + } + } + eigen_assert(m_outputRows > 0); + eigen_assert(m_outputCols > 0); + + // Dimensions for result of extraction. + if (static_cast(Layout) == static_cast(ColMajor)) { + // ColMajor + // 0: depth + // 1: patch_rows + // 2: patch_cols + // 3: number of patches + // 4 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_rows(); + m_dimensions[2] = op.patch_cols(); + m_dimensions[3] = m_outputRows * m_outputCols; + for (int i = 4; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i - 1]; + } + } else { + // RowMajor + // NumDims-1: depth + // NumDims-2: patch_rows + // NumDims-3: patch_cols + // NumDims-4: number of patches + // NumDims-5 and beyond: anything else (such as batch). + m_dimensions[NumDims - 1] = input_dims[NumInputDims - 1]; + m_dimensions[NumDims - 2] = op.patch_rows(); + m_dimensions[NumDims - 3] = op.patch_cols(); + m_dimensions[NumDims - 4] = m_outputRows * m_outputCols; + for (int i = NumDims - 5; i >= 0; --i) { + m_dimensions[i] = input_dims[i]; + } + } + + // Strides for moving the patch in various dimensions. 
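+    // (Editor's note) In the ColMajor branch below, m_colStride separates
+    // consecutive patch columns (in depth-normalized offsets), m_patchStride
+    // separates whole patches, and m_otherStride separates entries of the
+    // outer (e.g. batch) dimensions; coeff() inverts them with the fast
+    // divisors initialized further down.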
+ if (static_cast(Layout) == static_cast(ColMajor)) { + m_colStride = m_dimensions[1]; + m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[3]; + } else { + m_colStride = m_dimensions[NumDims - 2]; + m_patchStride = m_colStride * m_dimensions[NumDims - 3] * m_dimensions[NumDims - 1]; + m_otherStride = m_patchStride * m_dimensions[NumDims - 4]; + } + + // Strides for navigating through the input tensor. + m_rowInputStride = m_inputDepth; + m_colInputStride = m_inputDepth * m_inputRows; + m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols; + + // Fast representations of different variables. + m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); + m_fastColStride = internal::TensorIntDivisor(m_colStride); + m_fastInflateRowStride = internal::TensorIntDivisor(m_row_inflate_strides); + m_fastInflateColStride = internal::TensorIntDivisor(m_col_inflate_strides); + m_fastInputColsEff = internal::TensorIntDivisor(m_input_cols_eff); + + // Number of patches in the width dimension. + m_fastOutputRows = internal::TensorIntDivisor(m_outputRows); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[0]); + } else { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims - 1]); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; + + // Other ways to index this element. + const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride; + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + // Calculate col index in the input original tensor. + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = + (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate row index in the original input tensor. + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = + (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? 
(inputRow / m_fastInflateRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + + const Index inputIndex = + depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { + return packetWithPossibleZero(index); + } + + const Index indices[2] = {index, index + PacketSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, + (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; + + const Index patch2DIndex = + (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + // Calculate col indices in the original input tensor. + const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft, + colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0] * m_colStride, + patchOffsets[1] - colOffsets[1] * m_colStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + // Calculate col indices in the original input tensor. + const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop, + rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) { + // no padding + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 
0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + const Index inputIndex = + depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.template packet(inputIndex); + } + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // We conservatively estimate the cost for the code path where the computed + // index is inside the original image and + // TensorEvaluator::CoordAccess is false. + const double compute_cost = + 3 * TensorOpCost::DivCost() + 6 * TensorOpCost::MulCost() + 8 * TensorOpCost::MulCost(); + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + + Index m_otherStride; + Index m_patchStride; + Index m_colStride; + Index m_row_strides; + Index m_col_strides; + + Index m_in_row_strides; + Index m_in_col_strides; + Index m_row_inflate_strides; + Index m_col_inflate_strides; + + Index m_input_rows_eff; + Index m_input_cols_eff; + Index m_patch_rows_eff; + Index m_patch_cols_eff; + + internal::TensorIntDivisor m_fastOtherStride; + internal::TensorIntDivisor m_fastPatchStride; + internal::TensorIntDivisor m_fastColStride; + internal::TensorIntDivisor m_fastInflateRowStride; + internal::TensorIntDivisor m_fastInflateColStride; + internal::TensorIntDivisor m_fastInputColsEff; + + Index m_rowInputStride; + Index m_colInputStride; + Index m_patchInputStride; + + Index m_inputDepth; + Index m_inputRows; + Index m_inputCols; + + Index m_outputRows; + Index m_outputCols; + + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + internal::TensorIntDivisor m_fastOutputRows; + internal::TensorIntDivisor m_fastOutputDepth; + + Scalar m_paddingValue; + + const Device EIGEN_DEVICE_REF m_device; + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H diff --git 
a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h new file mode 100644 index 00000000000..69ab6840c82 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -0,0 +1,620 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \internal + * + * \class TensorIndexList + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode a set of Tensor dimensions/indices. + * + * The indices in the list can be known at compile time or at runtime. A mix + * of static and dynamic indices can also be provided if needed. The tensor + * code will attempt to take advantage of the indices that are known at + * compile time to optimize the code it generates. + * + * This functionality requires a c++11 compliant compiler. If your compiler + * is older you need to use arrays of indices instead. + * + * Several examples are provided in the cxx11_tensor_index_list.cpp file. + * + * \sa Tensor + */ + +template +struct type2index { + static constexpr Index value = n; + EIGEN_DEVICE_FUNC constexpr operator Index() const { return n; } + EIGEN_DEVICE_FUNC void set(Index val) { eigen_assert(val == n); } +}; + +// This can be used with IndexPairList to get compile-time constant pairs, +// such as IndexPairList, type2indexpair<3,4>>(). 
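+// For example (editor's sketch):
+//   Eigen::IndexPairList<Eigen::type2indexpair<1, 0>> contract_dims;
+//   auto c = a.contract(b, contract_dims);  // pair (1,0) fixed at compile time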
+template +struct type2indexpair { + static constexpr Index first = f; + static constexpr Index second = s; + + constexpr EIGEN_DEVICE_FUNC operator IndexPair() const { return IndexPair(f, s); } + + EIGEN_DEVICE_FUNC void set(const IndexPair& val) { + eigen_assert(val.first == f); + eigen_assert(val.second == s); + } +}; + +template +struct NumTraits> { + typedef Index Real; + enum { IsComplex = 0, RequireInitialization = false, ReadCost = 1, AddCost = 1, MulCost = 1 }; + + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real epsilon() { return 0; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real dummy_precision() { return 0; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real highest() { return n; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real lowest() { return n; } +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC void update_value(T& val, Index new_val) { + val = internal::convert_index(new_val); +} +template +EIGEN_DEVICE_FUNC void update_value(type2index& val, Index new_val) { + val.set(new_val); +} + +template +EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair new_val) { + val = new_val; +} +template +EIGEN_DEVICE_FUNC void update_value(type2indexpair& val, IndexPair new_val) { + val.set(new_val); +} + +template +struct is_compile_time_constant { + static constexpr bool value = false; +}; + +template +struct is_compile_time_constant> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant&> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant&> { + static constexpr bool value = true; +}; + +template +struct is_compile_time_constant> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant&> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant&> { + static constexpr bool value = true; +}; + +template +struct IndexTuple; + +template +struct IndexTuple { + EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() {} + EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) 
{} + + static constexpr int count = 1 + sizeof...(O); + T head; + IndexTuple others; + typedef T Head; + typedef IndexTuple Other; +}; + +template +struct IndexTuple { + EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() {} + EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) {} + + constexpr static int count = 1; + T head; + typedef T Head; +}; + +template +struct IndexTupleExtractor; + +template +struct IndexTupleExtractor { + typedef typename IndexTupleExtractor::ValType ValType; + + EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { + return IndexTupleExtractor::get_val(val.others); + } + + EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { + return IndexTupleExtractor::get_val(val.others); + } + template + EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { + IndexTupleExtractor::set_val(val.others, new_val); + } +}; + +template +struct IndexTupleExtractor<0, T, O...> { + typedef T ValType; + + EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { return val.head; } + EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { return val.head; } + template + EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { + val.head = new_val; + } +}; + +template +EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor::ValType& array_get(IndexTuple& tuple) { + return IndexTupleExtractor::get_val(tuple); +} +template +EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor::ValType& array_get( + const IndexTuple& tuple) { + return IndexTupleExtractor::get_val(tuple); +} +template +struct array_size> { + static constexpr size_t value = IndexTuple::count; +}; +template +struct array_size> { + static constexpr size_t value = IndexTuple::count; +}; + +template +struct tuple_coeff { + template + EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index i, const IndexTuple& t) { + // return array_get(t) * (i == Idx) + tuple_coeff::get(i, t) * (i != Idx); + return (i == Idx ? 
array_get(t) : tuple_coeff::get(i, t)); + } + template + EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple& t, const ValueT& value) { + if (i == Idx) { + update_value(array_get(t), value); + } else { + tuple_coeff::set(i, t, value); + } + } + + template + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple& t) { + return ((i == Idx) && is_compile_time_constant::ValType>::value) || + tuple_coeff::value_known_statically(i, t); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple& t) { + return is_compile_time_constant::ValType>::value && + tuple_coeff::values_up_to_known_statically(t); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple& t) { + return is_compile_time_constant::ValType>::value && + is_compile_time_constant::ValType>::value && + array_get(t) > array_get(t) && + tuple_coeff::values_up_to_statically_known_to_increase(t); + } +}; + +template +struct tuple_coeff<0, ValueT> { + template + EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index /*i*/, const IndexTuple& t) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return array_get<0>(t) /* * (i == 0)*/; + } + template + EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple& t, const ValueT value) { + eigen_assert(i == 0); + update_value(array_get<0>(t), value); + } + template + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple&) { + return is_compile_time_constant::ValType>::value && (i == 0); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple&) { + return is_compile_time_constant::ValType>::value; + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple&) { + return true; + } +}; +} // namespace internal + +template +struct IndexList : internal::IndexTuple { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index operator[](const Index i) const { + return internal::tuple_coeff>::value - 1, + Index>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index get(const Index i) const { + return internal::tuple_coeff>::value - 1, + Index>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value) { + return internal::tuple_coeff>::value - 1, + Index>::set(i, *this, value); + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr std::size_t size() const { return 1 + sizeof...(OtherTypes); }; + + EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple& other) + : internal::IndexTuple(other) {} + EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) + : internal::IndexTuple(first, other...) 
{} + EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple() {} + + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const { + return internal::tuple_coeff>::value - 1, + Index>::value_known_statically(i, *this); + } + EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const { + return internal::tuple_coeff>::value - 1, + Index>::values_up_to_known_statically(*this); + } + + EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const { + return internal::tuple_coeff>::value - 1, + Index>::values_up_to_statically_known_to_increase(*this); + } +}; + +template +std::ostream& operator<<(std::ostream& os, const IndexList& dims) { + os << "["; + for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) { + if (i > 0) os << ", "; + os << dims[i]; + } + os << "]"; + return os; +} + +template +constexpr IndexList make_index_list(FirstType val1, OtherTypes... other_vals) { + return IndexList(val1, other_vals...); +} + +template +struct IndexPairList : internal::IndexTuple { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair operator[](const Index i) const { + return internal::tuple_coeff>::value - 1, + IndexPair>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const IndexPair value) { + return internal::tuple_coeff>::value - 1, + IndexPair>::set(i, *this, value); + } + + EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple& other) + : internal::IndexTuple(other) {} + EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple() {} + + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const { + return internal::tuple_coeff>::value - 1, + Index>::value_known_statically(i, *this); + } +}; + +namespace internal { + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList& sizes) { + Index result = 1; + EIGEN_UNROLL_LOOP + for (size_t i = 0; i < array_size>::value; ++i) { + result *= sizes[i]; + } + return result; +} + +template +struct array_size> { + static const size_t value = array_size>::value; +}; +template +struct array_size> { + static const size_t value = array_size>::value; +}; + +template +struct array_size> { + static const size_t value = 1 + sizeof...(OtherTypes); +}; +template +struct array_size> { + static const size_t value = 1 + sizeof...(OtherTypes); +}; + +template +EIGEN_DEVICE_FUNC constexpr Index array_get(IndexList& a) { + return IndexTupleExtractor::get_val(a); +} +template +EIGEN_DEVICE_FUNC constexpr Index array_get(const IndexList& a) { + return IndexTupleExtractor::get_val(a); +} + +template +struct index_known_statically_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index) { return false; } +}; + +template +struct index_known_statically_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) { + return IndexList().value_known_statically(i); + } +}; + +template +struct index_known_statically_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) { + return IndexList().value_known_statically(i); + } +}; + +template +struct all_indices_known_statically_impl { + static constexpr bool run() { return false; } +}; + +template +struct all_indices_known_statically_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return IndexList().all_values_known_statically(); + } +}; + +template +struct all_indices_known_statically_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return IndexList().all_values_known_statically(); + } +}; + +template +struct 
indices_statically_known_to_increase_impl { + EIGEN_DEVICE_FUNC static constexpr bool run() { return false; } +}; + +template +struct indices_statically_known_to_increase_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return Eigen::IndexList().values_statically_known_to_increase(); + } +}; + +template +struct indices_statically_known_to_increase_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return Eigen::IndexList().values_statically_known_to_increase(); + } +}; + +template +struct index_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } +}; + +template +struct index_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) == value); + } +}; + +template +struct index_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) == value); + } +}; + +template +struct index_statically_ne_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } +}; + +template +struct index_statically_ne_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) != value); + } +}; + +template +struct index_statically_ne_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) != value); + } +}; + +template +struct index_statically_gt_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } +}; + +template +struct index_statically_gt_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) > value); + } +}; + +template +struct index_statically_gt_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) > value); + } +}; + +template +struct index_statically_lt_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } +}; + +template +struct index_statically_lt_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) < value); + } +}; + +template +struct index_statically_lt_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) < value); + } +}; + +template +struct index_pair_first_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } +}; + +template +struct index_pair_first_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexPairList().value_known_statically(i) && + (IndexPairList().operator[](i).first == value); + } +}; + +template +struct index_pair_first_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexPairList().value_known_statically(i) && + (IndexPairList().operator[](i).first == value); + } +}; + +template +struct index_pair_second_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; 
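+// Illustrative sketch (assumes the type2index helper defined earlier in this
+// header): entries declared with compile-time index types are visible to the
+// *_statically_* predicates above, while runtime entries are not.
+//   Eigen::IndexList<Eigen::type2index<0>, Eigen::Index> dims;
+//   dims.set(1, 5);                            // index 0 is fixed at 0; index 1 is runtime
+//   bool s0 = dims.value_known_statically(0);  // true
+//   bool s1 = dims.value_known_statically(1);  // false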
} +}; + +template +struct index_pair_second_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexPairList().value_known_statically(i) && + (IndexPairList().operator[](i).second == value); + } +}; + +template +struct index_pair_second_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexPairList().value_known_statically(i) && + (IndexPairList().operator[](i).second == value); + } +}; + +} // end namespace internal +} // end namespace Eigen + +namespace Eigen { +namespace internal { +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) { + return index_known_statically_impl::run(i); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() { + return all_indices_known_statically_impl::run(); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() { + return indices_statically_known_to_increase_impl::run(); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) { + return index_statically_eq_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) { + return index_statically_ne_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) { + return index_statically_gt_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) { + return index_statically_lt_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) { + return index_pair_first_statically_eq_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) { + return index_pair_second_statically_eq_impl::run(i, value); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h new file mode 100644 index 00000000000..5b0fca8b66a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -0,0 +1,226 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Ke Yang +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorInflation + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor inflation class. 
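+ * Inflation inserts stride - 1 zero-valued holes between adjacent
+ * coefficients along each dimension, so a dimension of size d becomes
+ * (d - 1) * stride + 1. As an illustrative sketch, inflating the vector
+ * {1, 2, 3} with a stride of 2 yields {1, 0, 2, 0, 3}.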
+ * + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorInflationOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorInflationOp type; +}; + +} // end namespace internal + +template +class TensorInflationOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides) + : m_xpr(expr), m_strides(strides) {} + + EIGEN_DEVICE_FUNC const Strides& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Strides m_strides; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorInflationOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_strides(op.strides()) { + m_dimensions = m_impl.dimensions(); + // Expand each dimension to the inflated dimension. + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1; + } + + // Remember the strides for fast division. 
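+ // The cached TensorIntDivisor (see TensorIntDiv.h) turns each division by
+ // a stride in getInputIndex() below into a multiply-and-shift, which is
+ // much cheaper than hardware integer division on most targets.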
+ for (int i = 0; i < NumDims; ++i) { + m_fastStrides[i] = internal::TensorIntDivisor(m_strides[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + } else { // RowMajor + m_outputStrides[NumDims - 1] = 1; + m_inputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + // Computes the input index given the output index. Returns true if the output + // index doesn't fall into a hole. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const { + eigen_assert(index < dimensions().TotalSize()); + *inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (idx != idx / m_fastStrides[i] * m_strides[i]) { + return false; + } + *inputIndex += idx / m_strides[i] * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index != index / m_fastStrides[0] * m_strides[0]) { + return false; + } + *inputIndex += index / m_strides[0]; + return true; + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (idx != idx / m_fastStrides[i] * m_strides[i]) { + return false; + } + *inputIndex += idx / m_strides[i] * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index != index / m_fastStrides[NumDims - 1] * m_strides[NumDims - 1]) { + return false; + } + *inputIndex += index / m_strides[NumDims - 1]; + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + Index inputIndex = 0; + if (getInputIndex(index, &inputIndex)) { + return m_impl.coeff(inputIndex); + } else { + return Scalar(0); + } + } + + // TODO(yangke): optimize this function so that we can detect and produce + // all-zero packets + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (3 * TensorOpCost::DivCost() + 3 * TensorOpCost::MulCost() + + 2 * TensorOpCost::AddCost()); + const double input_size = m_impl.dimensions().TotalSize(); + const double output_size = m_dimensions.TotalSize(); + if (output_size == 0) return TensorOpCost(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(sizeof(CoeffReturnType) * input_size / 
output_size, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + const Strides m_strides; + array, NumDims> m_fastStrides; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h new file mode 100644 index 00000000000..85cf492a001 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -0,0 +1,78 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H +#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H + +#include + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorInitializer + * \ingroup CXX11_Tensor_Module + * + * \brief Helper template to initialize Tensors from std::initializer_lists. + */ +namespace internal { + +template +struct Initializer { + typedef std::initializer_list::InitList> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + for (const auto& v : vals) { + (*indices)[traits::NumDimensions - N] = i++; + Initializer::run(tensor, indices, v); + } + } +}; + +template +struct Initializer { + typedef std::initializer_list::Scalar> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + // There is likely a faster way to do that than iterating. + for (const auto& v : vals) { + (*indices)[traits::NumDimensions - 1] = i++; + tensor.coeffRef(*indices) = v; + } + } +}; + +template +struct Initializer { + typedef typename traits::Scalar InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>*, const InitList& v) { + tensor.coeffRef(0) = v; + } +}; + +template +void initialize_tensor(TensorEvaluator& tensor, + const typename Initializer::NumDimensions>::InitList& vals) { + Eigen::array::Index, traits::NumDimensions> indices; + Initializer::NumDimensions>::run(tensor, &indices, vals); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h new file mode 100644 index 00000000000..fddc64822d0 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -0,0 +1,261 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H +#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \internal + * + * \class TensorIntDiv + * \ingroup CXX11_Tensor_Module + * + * \brief Fast integer division by a constant. + * + * See the paper from Granlund and Montgomery for explanation. + * (at https://doi.org/10.1145/773473.178249) + * + * \sa Tensor + */ + +namespace internal { + +// Note: result is undefined if val == 0 +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t count_leading_zeros(const T val) { +#ifdef EIGEN_GPU_COMPILE_PHASE + return __clz(val); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::clz(val); +#elif EIGEN_COMP_MSVC + unsigned long index; + _BitScanReverse(&index, val); + return 31 - index; +#else + EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); + return __builtin_clz(static_cast(val)); +#endif +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t count_leading_zeros(const T val) { +#ifdef EIGEN_GPU_COMPILE_PHASE + return __clzll(val); +#elif defined(SYCL_DEVICE_ONLY) + return static_cast(cl::sycl::clz(val)); +#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 + unsigned long index; + _BitScanReverse64(&index, val); + return 63 - index; +#elif EIGEN_COMP_MSVC + // MSVC's _BitScanReverse64 is not available for 32bits builds. + unsigned int lo = (unsigned int)(val & 0xffffffff); + unsigned int hi = (unsigned int)((val >> 32) & 0xffffffff); + int n; + if (hi == 0) + n = 32 + count_leading_zeros(lo); + else + n = count_leading_zeros(hi); + return n; +#else + EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); + return __builtin_clzll(static_cast(val)); +#endif +} + +template +struct UnsignedTraits { + typedef std::conditional_t type; +}; + +template +struct DividerTraits { + typedef typename UnsignedTraits::type type; + static constexpr int N = sizeof(T) * 8; +}; + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __umulhi(a, b); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::mul_hi(a, static_cast(b)); +#else + return (static_cast(a) * b) >> 32; +#endif +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __umul64hi(a, b); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::mul_hi(a, static_cast(b)); +#elif EIGEN_COMP_MSVC && (EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64) + return __umulh(a, static_cast(b)); +#elif EIGEN_HAS_BUILTIN_INT128 + __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); + return static_cast(v >> 64); +#else + return (TensorUInt128, uint64_t>(a) * TensorUInt128, uint64_t>(b)).upper(); +#endif +} + +template +struct DividerHelper { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) { + EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE); + return static_cast((static_cast(1) << (N + log_div)) / divider - + (static_cast(1) << N) + 1); + } +}; + +template +struct DividerHelper<64, T> { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { +#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) + 
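+ // With a native 128-bit integer type the multiplier
+ // (1 << (64 + log_div)) / divider - (1 << 64) + 1 can be computed
+ // directly; the #else branch below emulates the same computation with
+ // TensorUInt128.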
return static_cast((static_cast<__uint128_t>(1) << (64 + log_div)) / static_cast<__uint128_t>(divider) - + (static_cast<__uint128_t>(1) << 64) + 1); +#else + const uint64_t shift = 1ULL << log_div; + TensorUInt128 result = + TensorUInt128 >(shift, 0) / TensorUInt128, uint64_t>(divider) - + TensorUInt128, static_val<0> >(1, 0) + TensorUInt128, static_val<1> >(1); + return static_cast(result); +#endif + } +}; + +template +struct TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + multiplier = 0; + shift1 = 0; + shift2 = 0; + } + + // Must have 0 < divider < 2^31. This is relaxed to + // 0 < divider < 2^63 when using 64-bit indices on platforms that support + // the __uint128_t type. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { + const int N = DividerTraits::N; + eigen_assert(static_cast::type>(divider) < NumTraits::highest() / 2); + eigen_assert(divider > 0); + + // fast ln2 + const int leading_zeros = count_leading_zeros(static_cast(divider)); + int log_div = N - leading_zeros; + // if divider is a power of two then log_div is 1 more than it should be. + if ((static_cast::type>(1) << (log_div - 1)) == + static_cast::type>(divider)) + log_div--; + + multiplier = DividerHelper::computeMultiplier(log_div, divider); + shift1 = log_div > 1 ? 1 : log_div; + shift2 = log_div > 1 ? log_div - 1 : 0; + } + + // Must have 0 <= numerator. On platforms that don't support the __uint128_t + // type numerator should also be less than 2^32-1. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { + eigen_assert(static_cast::type>(numerator) < NumTraits::highest() / 2); + // eigen_assert(numerator >= 0); // this is implicitly asserted by the line above + + UnsignedType t1 = muluh(multiplier, numerator); + UnsignedType t = (static_cast(numerator) - t1) >> shift1; + return (t1 + t) >> shift2; + } + + private: + typedef typename DividerTraits::type UnsignedType; + UnsignedType multiplier; + int32_t shift1; + int32_t shift2; +}; + +// Optimized version for signed 32 bit integers. +// Derived from Hacker's Delight. +// Only works for divisors strictly greater than one +template <> +class TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + magic = 0; + shift = 0; + } + // Must have 2 <= divider + EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider) { + eigen_assert(divider >= 2); + calcMagic(divider); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { +#ifdef EIGEN_GPU_COMPILE_PHASE + return (__umulhi(magic, n) >> shift); +#elif defined(SYCL_DEVICE_ONLY) + return (cl::sycl::mul_hi(magic, static_cast(n)) >> shift); +#else + uint64_t v = static_cast(magic) * static_cast(n); + return (static_cast(v >> 32) >> shift); +#endif + } + + private: + // Compute the magic numbers. See Hacker's Delight section 10 for an in + // depth explanation. + EIGEN_DEVICE_FUNC void calcMagic(int32_t d) { + const unsigned two31 = 0x80000000; // 2**31. + unsigned ad = d; + unsigned t = two31 + (ad >> 31); + unsigned anc = t - 1 - t % ad; // Absolute value of nc. + int p = 31; // Init. p. + unsigned q1 = two31 / anc; // Init. q1 = 2**p/|nc|. + unsigned r1 = two31 - q1 * anc; // Init. r1 = rem(2**p, |nc|). + unsigned q2 = two31 / ad; // Init. q2 = 2**p/|d|. + unsigned r2 = two31 - q2 * ad; // Init. r2 = rem(2**p, |d|). + unsigned delta = 0; + do { + p = p + 1; + q1 = 2 * q1; // Update q1 = 2**p/|nc|. + r1 = 2 * r1; // Update r1 = rem(2**p, |nc|). 
+ if (r1 >= anc) { // (Must be an unsigned + q1 = q1 + 1; // comparison here). + r1 = r1 - anc; + } + q2 = 2 * q2; // Update q2 = 2**p/|d|. + r2 = 2 * r2; // Update r2 = rem(2**p, |d|). + if (r2 >= ad) { // (Must be an unsigned + q2 = q2 + 1; // comparison here). + r2 = r2 - ad; + } + delta = ad - r2; + } while (q1 < delta || (q1 == delta && r1 == 0)); + + magic = (unsigned)(q2 + 1); + shift = p - 32; + } + + uint32_t magic; + int32_t shift; +}; + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator/(const T& numerator, const TensorIntDivisor& divisor) { + return divisor.divide(numerator); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h new file mode 100644 index 00000000000..43f4bcb4656 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -0,0 +1,185 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorLayoutSwap + * \ingroup CXX11_Tensor_Module + * + * \brief Swap the layout from col-major to row-major, or row-major + * to col-major, and invert the order of the dimensions. + * + * Beware: the dimensions are reversed by this operation. If you want to + * preserve the ordering of the dimensions, you need to combine this + * operation with a shuffle. + * + * \example: + * Tensor input(2, 4); + * Tensor output = input.swap_layout(); + * eigen_assert(output.dimension(0) == 4); + * eigen_assert(output.dimension(1) == 2); + * + * array shuffle(1, 0); + * output = input.swap_layout().shuffle(shuffle); + * eigen_assert(output.dimension(0) == 2); + * eigen_assert(output.dimension(1) == 4); + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = (traits::Layout == ColMajor) ? 
RowMajor : ColMajor; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorLayoutSwapOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorLayoutSwapOp type; +}; + +} // end namespace internal + +template +class TensorLayoutSwapOp : public TensorBase, WriteAccessors> { + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorLayoutSwapOp) + protected: + typename XprType::Nested m_xpr; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorLayoutSwapOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + + static constexpr int Layout = + (TensorEvaluator::Layout == static_cast(ColMajor)) ? RowMajor : ColMajor; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = m_impl.dimensions()[NumDims - 1 - i]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_impl.coeff(index); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return m_impl.template packet(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return constCast(m_impl.data()); } + + const TensorEvaluator& impl() const { return m_impl; } + + protected: + TensorEvaluator m_impl; + Dimensions m_dimensions; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorLayoutSwapOp XprType; + + static constexpr int Layout = + (TensorEvaluator::Layout == 
static_cast(ColMajor)) ? RowMajor : ColMajor; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false // to be implemented + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + return this->m_impl.coeffRef(index); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + this->m_impl.template writePacket(index, x); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h new file mode 100644 index 00000000000..f8bbcfee74a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -0,0 +1,85 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
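+// Illustrative sketch for the TensorLayoutSwapOp above: swap_layout()
+// reverses both the storage order and the dimension order, so combine it
+// with shuffle() when the logical dimension order must be preserved.
+//   Eigen::Tensor<float, 2, Eigen::ColMajor> input(2, 4);
+//   Eigen::array<int, 2> shuffle{{1, 0}};
+//   Eigen::Tensor<float, 2, Eigen::RowMajor> output =
+//       input.swap_layout().shuffle(shuffle);
+//   // output.dimension(0) == 2, output.dimension(1) == 4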
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H +#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H + +/** use this macro in sfinae selection in templated functions + * + * template::value , int > = 0 + * > + * void foo(){} + * + * becomes => + * + * template::value ) + * > + * void foo(){} + */ + +#define EIGEN_SFINAE_ENABLE_IF(__condition__) std::enable_if_t<(__condition__), int> = 0 + +// Define a macro to use a reference on the host but a value on the device +#if defined(SYCL_DEVICE_ONLY) +#define EIGEN_DEVICE_REF +#else +#define EIGEN_DEVICE_REF & +#endif + +// Define a macro for catching SYCL exceptions if exceptions are enabled +#define EIGEN_SYCL_TRY_CATCH(X) \ + do { \ + EIGEN_TRY { X; } \ + EIGEN_CATCH(const cl::sycl::exception& e) { \ + EIGEN_THROW_X(std::runtime_error("SYCL exception at " + std::string(__FILE__) + ":" + std::to_string(__LINE__) + \ + "\n" + e.what())); \ + } \ + } while (false) + +// Define a macro if local memory flags are unset or one of them is set +// Setting both flags is the same as unsetting them +#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \ + (defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)) +#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 +#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 +#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) +#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 +#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) +#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 +#endif + +#if EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) +#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + using Base::operator=; \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { \ + Base::operator=(other); \ + return *this; \ + } \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) { \ + Base::operator=(other); \ + return *this; \ + } +#else +#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) +#endif + +/** \internal + * \brief Macro to manually inherit assignment operators. + * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is + * defined. This also inherits template operator=(const OtherDerived&) assignments. With C++11 or later + * this also default-implements the copy-constructor + */ +#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(Derived) \ + EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived) + +#endif diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h new file mode 100644 index 00000000000..191413b7d03 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -0,0 +1,191 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
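+// Illustrative sketch of the TensorMap declared below: it adapts an existing
+// buffer as a tensor expression without taking ownership or copying.
+//   float data[12] = {};
+//   Eigen::TensorMap<Eigen::Tensor<float, 2>> map(data, 3, 4);
+//   map(1, 2) = 5.0f;   // writes through to the underlying buffer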
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_) + +/** \class TensorMap + * \ingroup CXX11_Tensor_Module + * + * \brief A tensor expression mapping an existing array of data. + * + */ +/// `template class MakePointer_` is added to convert the host pointer to the device pointer. +/// It is added due to the fact that for our device compiler `T*` is not allowed. +/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`. +/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_` is `T*` . +/// Therefore, by adding the default value, we managed to convert the type and it does not break any +/// existing code as its default value is `T*`. +template class MakePointer_> +class TensorMap : public TensorBase > { + public: + typedef TensorMap Self; + typedef TensorBase > Base; +#ifdef EIGEN_USE_SYCL + typedef std::remove_reference_t::type> Nested; +#else + typedef typename Eigen::internal::nested::type Nested; +#endif + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::traits::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename PlainObjectType::Base::CoeffReturnType CoeffReturnType; + + typedef typename MakePointer_::Type PointerType; + typedef typename MakePointer_::ConstType PointerConstType; + + // WARN: PointerType still can be a pointer to const (const Scalar*), for + // example in TensorMap> expression. This type of + // expression should be illegal, but adding this restriction is not possible + // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488). + typedef std::conditional_t::value), + PointerType, // use simple pointer in lvalue expressions + PointerConstType // use const pointer in rvalue expressions + > + StoragePointerType; + + // If TensorMap was constructed over rvalue expression (e.g. const Tensor), + // we should return a reference to const from operator() (and others), even + // if TensorMap itself is not const. + typedef std::conditional_t::value), Scalar&, const Scalar&> StorageRefType; + + static constexpr int Options = Options_; + + static constexpr Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + static constexpr int Layout = PlainObjectType::Layout; + enum { IsAligned = ((int(Options_) & Aligned) == Aligned), CoordAccess = true, RawAccess = true }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, + IndexTypes... otherDimensions) + : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), + YOU_MADE_A_PROGRAMMING_MISTAKE) + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, + const array& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor) + : m_data(tensor.data()), m_dimensions(tensor.dimensions()) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) const { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options & RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()() const { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + return m_data[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, + IndexTypes... otherIndices) const { + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); + if (PlainObjectType::Options & RowMajor) { + const Index index = + m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = + m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options & RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()() { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + return m_data[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(Index index) { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, + IndexTypes... 
otherIndices) { + static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, + "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); + const std::size_t NumDims = sizeof...(otherIndices) + 2; + if (PlainObjectType::Options & RowMajor) { + const Index index = + m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = + m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } + } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorMap) + + private: + StoragePointerType m_data; + Dimensions m_dimensions; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h new file mode 100644 index 00000000000..8c2bb2ef9e2 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -0,0 +1,292 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H +#define EIGEN_CXX11_TENSOR_TENSOR_META_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +template +struct Cond {}; + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& choose(Cond, const T1& first, const T2&) { + return first; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T2& choose(Cond, const T1&, const T2& second) { + return second; +} + +template +struct max_n_1 { + static const size_t size = n; +}; +template <> +struct max_n_1<0> { + static const size_t size = 1; +}; + +template +EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T divup(const T x, const T y) { + return Eigen::numext::div_ceil(x, y); +} + +// Default packet types +template +struct PacketType : internal::packet_traits { + typedef typename internal::packet_traits::type type; +}; + +// For CUDA packet types when using a GpuDevice +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE) + +typedef ulonglong2 Packet4h2; +template <> +struct PacketType { + typedef Packet4h2 type; + static const int size = 8; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasExpm1 = 0, + HasLog = 1, + HasLog1p = 0, + HasLog10 = 0, + HasPow = 1, + }; +}; +#endif + +#if defined(EIGEN_USE_SYCL) + +namespace TensorSycl { +namespace internal { + +template +struct PlusOp { + static constexpr Index Value = A + B; +}; + +template +struct DivOp { + static constexpr Index Value = A / B; +}; + +template class StepOp> +struct static_for { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) { + op(start); + static_for::Value, end, step, StepOp>::loop(op); + } +}; +template class StepOp> 
+struct static_for { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {} +}; + +template +struct Vectorise { + static constexpr int PacketSize = 1; + typedef OutScalar PacketReturnType; +}; + +template +struct Vectorise { + static constexpr int PacketSize = Eigen::PacketType::size; + typedef typename Eigen::PacketType::type PacketReturnType; +}; + +static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) { return ((((x) + (y)-1) / (y)) * (y)); } + +} // namespace internal +} // namespace TensorSycl + +template <> +struct PacketType { + typedef half type; + static const int size = 1; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasArg = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0 + }; +}; +template +struct PacketType : internal::default_packet_traits { + typedef Scalar type; + typedef Scalar half; + enum { + Vectorizable = 0, + size = 1, + AlignedOnScalar = 0, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0 + }; +}; + +template +struct PacketType : PacketType {}; + +#ifndef EIGEN_DONT_VECTORIZE_SYCL +#define PACKET_TYPE(CVQual, Type, val, lengths, DEV) \ + template <> \ + struct PacketType : internal::sycl_packet_traits { \ + typedef typename internal::packet_traits::type type; \ + typedef typename internal::packet_traits::half half; \ + }; + +PACKET_TYPE(const, float, 1, 4, SyclDevice) +PACKET_TYPE(, float, 1, 4, SyclDevice) +PACKET_TYPE(const, float, 1, 4, const SyclDevice) +PACKET_TYPE(, float, 1, 4, const SyclDevice) + +PACKET_TYPE(const, double, 0, 2, SyclDevice) +PACKET_TYPE(, double, 0, 2, SyclDevice) +PACKET_TYPE(const, double, 0, 2, const SyclDevice) +PACKET_TYPE(, double, 0, 2, const SyclDevice) +#undef PACKET_TYPE + +template <> +struct PacketType : PacketType {}; +template <> +struct PacketType : PacketType {}; +#endif +#endif + +// Pair mimics std::pair but works on e.g. nvcc. 
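+// (std::pair is not reliably usable in device code on e.g. nvcc, hence this
+// trivially device-compatible replacement.) Illustrative usage:
+//   Eigen::Pair<int, float> p(1, 2.0f);
+//   int key = p.first;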
+template +struct Pair { + public: + EIGEN_MAKE_ALIGNED_OPERATOR_NEW + + U first; + V second; + + typedef U first_type; + typedef V second_type; + + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair() : first(), second() {} + + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair(const U& f, const V& s) : first(f), second(s) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(Pair& rhs) { + using numext::swap; + swap(first, rhs.first); + swap(second, rhs.second); + } +}; + +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const Pair& x, const Pair& y) { + return (x.first == y.first && x.second == y.second); +} + +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const Pair& x, const Pair& y) { + return !(x == y); +} + +// Can't use std::pairs on cuda devices +template +struct IndexPair { + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {} + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {} + + EIGEN_DEVICE_FUNC void set(IndexPair val) { + first = val.first; + second = val.second; + } + + Idx first; + Idx second; +}; + +namespace internal { + +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array( + IndexType& idx, numeric_list) { + return {static_cast(idx[First]), static_cast(idx[Is])...}; +} +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array(IndexType&, + numeric_list) { + return array(); +} + +/** Make an array (for index/dimensions) out of a custom index */ +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array(IndexType& idx) { + return customIndices2Array(idx, typename gen_numeric_list::type{}); +} + +template +struct is_base_of { + typedef char (&yes)[1]; + typedef char (&no)[2]; + + template + struct Host { + operator BB*() const; + operator DD*(); + }; + + template + static yes check(D*, T); + static no check(B*, int); + + static const bool value = sizeof(check(Host(), int())) == sizeof(yes); +}; + +} // namespace internal + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h new file mode 100644 index 00000000000..d790f35bb9c --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -0,0 +1,984 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H +#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. 
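+ * Reshaping reinterprets the coefficients of an expression under a new set
+ * of dimensions; the total number of coefficients must stay the same. As an
+ * illustrative sketch, a 2x6 tensor t can be viewed as 3x4 via
+ * t.reshape(Eigen::array<Eigen::Index, 2>{{3, 4}}).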
+ * + * + */ +namespace internal { +template +struct traits> : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = array_size::value; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorReshapingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, + typename eval>::type> { + typedef TensorReshapingOp type; +}; + +} // end namespace internal + +template +class TensorReshapingOp : public TensorBase, WriteAccessors> { + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC const NewDimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReshapingOp) + + protected: + typename XprType::Nested m_xpr; + const NewDimensions m_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory, Device> ConstCastStorage; + + static constexpr int NumOutputDims = internal::array_size::value; + static constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + + enum ReshapingKind { + // We do not use layout information to determine reshaping kind. + // Depending on the layout `N` can be inner or outer dimension. + OneByN = 0, // expr.reshape(1, N) + NByOne = 1, // expr.reshape(N, 1) + Runtime = 2 // Reshape dimensions are dynamic (specified at runtime). + }; + + // clang-format off + static const ReshapingKind kind = + (NumOutputDims == 2 && internal::index_statically_eq(/*index=*/0, /*value=*/1)) ? OneByN + : (NumOutputDims == 2 && internal::index_statically_eq(/*index=*/1, /*value=*/1)) ? NByOne + : Runtime; + // clang-format on + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + // For trivial reshapes with raw access to underlying data we will provide + // zero overhead block access. + // TODO(ezhulenev): Consider adding block access without raw access? 
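+ // Raw access means the evaluated input is a plain contiguous buffer, so a
+ // reshaped block can often be exposed as a zero-copy view at a new offset
+ // (see the OneByN/NByOne fast paths in block() below).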
+    BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess && NumInputDims > 0 && NumOutputDims > 0,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+  };
+
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumOutputDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumOutputDims, Layout, Index> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_dimensions(op.dimensions()) {
+    // The total size of the reshaped tensor must be equal to the total size
+    // of the input tensor.
+    eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(data, std::move(done));
+  }
+#endif
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); }
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_impl.coeff(index); }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return m_impl.template packet<LoadMode>(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return m_impl.costPerCoeff(vectorized);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
+
+  // required in block(OutputTensorBlock* output_block) const
+  // For C++03 compatibility this must be defined outside the method
+  struct BlockIteratorState {
+    Index stride;
+    Index span;
+    Index size;
+    Index count;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    eigen_assert(m_impl.data() != NULL);
+    eigen_assert((kind == Runtime) || (kind == OneByN && desc.dimensions()[0] == 1) ||
+                 (kind == NByOne && desc.dimensions()[1] == 1));
+
+    if (kind == OneByN || kind == NByOne) {
+      // We can guarantee at compile time that block is just a contiguous slice
+      // of the underlying expression memory buffer.
+      return TensorBlock(internal::TensorBlockKind::kView, m_impl.data() + desc.offset(), desc.dimensions());
+    } else {
+      // This will do additional runtime checks, and in the end it might also
+      // be a view, or it might be a block materialized in the temporary buffer.
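+      // (Descriptive note, not from the upstream sources: materialize() first
+      // checks whether the requested block is contiguous in the input buffer,
+      // and only copies coefficients into the scratch allocator when it is
+      // not.)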
+ return TensorBlock::materialize(m_impl.data(), m_dimensions, desc, scratch); + } + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return constCast(m_impl.data()); } + + EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } + + protected: + TensorEvaluator m_impl; + NewDimensions m_dimensions; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> + +{ + typedef TensorEvaluator, Device> Base; + typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess + }; + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + return this->m_impl.coeffRef(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + this->m_impl.template writePacket(index, x); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) { + eigen_assert(this->m_impl.data() != NULL); + + typedef typename TensorBlock::XprType TensorBlockExpr; + typedef internal::TensorBlockAssignment + TensorBlockAssign; + + TensorBlockAssign::Run(TensorBlockAssign::target(desc.dimensions(), internal::strides(this->dimensions()), + this->m_impl.data(), desc.offset()), + block.expr()); + } +}; + +/** \class TensorSlicing + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor slicing class. 
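+ *
+ * Extracts a sub-tensor given per-dimension start indices and extents. A
+ * minimal usage sketch (illustrative only, not part of the upstream sources):
+ * \code
+ * Eigen::Tensor<float, 2> input(4, 6);
+ * input.setRandom();
+ * Eigen::array<Eigen::Index, 2> offsets = {1, 2};
+ * Eigen::array<Eigen::Index, 2> extents = {2, 3};
+ * // 2x3 block whose top-left corner sits at (1, 2).
+ * Eigen::Tensor<float, 2> slice = input.slice(offsets, extents);
+ * \endcode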
+ *
+ *
+ */
+namespace internal {
+template <typename StartIndices, typename Sizes, typename XprType>
+struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = array_size<StartIndices>::value;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <typename StartIndices, typename Sizes, typename XprType>
+struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense> {
+  typedef const TensorSlicingOp<StartIndices, Sizes, XprType> EIGEN_DEVICE_REF type;
+};
+
+template <typename StartIndices, typename Sizes, typename XprType>
+struct nested<TensorSlicingOp<StartIndices, Sizes, XprType>, 1,
+              typename eval<TensorSlicingOp<StartIndices, Sizes, XprType> >::type> {
+  typedef TensorSlicingOp<StartIndices, Sizes, XprType> type;
+};
+
+}  // end namespace internal
+
+template <typename StartIndices, typename Sizes, typename XprType>
+class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> > {
+ public:
+  typedef TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> > Base;
+  typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorSlicingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices,
+                                                        const Sizes& sizes)
+      : m_xpr(expr), m_indices(indices), m_sizes(sizes) {}
+
+  EIGEN_DEVICE_FUNC const StartIndices& startIndices() const { return m_indices; }
+  EIGEN_DEVICE_FUNC const Sizes& sizes() const { return m_sizes; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+  EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorSlicingOp)
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const StartIndices m_indices;
+  const Sizes m_sizes;
+};
+
+namespace internal {
+
+// FIXME: figure out the exact threshold
+template <typename Index, typename Device, bool BlockAccess>
+struct MemcpyTriggerForSlicing {
+  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) {}
+  EIGEN_DEVICE_FUNC bool operator()(Index total, Index contiguous) const {
+    const bool prefer_block_evaluation = BlockAccess && total > 32 * 1024;
+    return !prefer_block_evaluation && contiguous > threshold_;
+  }
+
+ private:
+  Index threshold_;
+};
+
+// It is very expensive to start the memcpy kernel on GPU: we therefore only
+// use it for large copies.
+#ifdef EIGEN_USE_GPU
+template <typename Index, bool BlockAccess>
+struct MemcpyTriggerForSlicing<Index, GpuDevice, BlockAccess> {
+  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) {}
+  EIGEN_DEVICE_FUNC bool operator()(Index, Index contiguous) const { return contiguous > 4 * 1024 * 1024; }
+};
+#endif
+
+// Starting the memcpy kernel is likewise expensive on SYCL devices: we
+// therefore only use it for large copies.
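+// (Illustrative example, not from the upstream sources: slicing the first
+// three columns of a ColMajor 1000x1000 tensor yields a single run of 3000
+// contiguous coefficients, comfortably above the default threshold of
+// 2 * numThreads; slicing three rows instead yields contiguous runs of
+// length 3, so the slice falls back to regular per-coefficient evaluation.)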
+#ifdef EIGEN_USE_SYCL +template +struct MemcpyTriggerForSlicing { + EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) {} + EIGEN_DEVICE_FUNC bool operator()(Index, Index contiguous) const { return contiguous > 4 * 1024 * 1024; } +}; +#endif + +} // namespace internal + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorSlicingOp XprType; + static constexpr int NumDims = internal::array_size::value; + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + typedef StorageMemory Storage; + typedef StorageMemory, Device> ConstCastStorage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets and sizes. + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess && + // FIXME: Temporary workaround for bug in slicing of bool tensors. + !internal::is_same, bool>::value, + PreferBlockAccess = true, + CoordAccess = false, + RawAccess = false + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + // Tensor slicing does not change the block type. + typedef typename TensorEvaluator::TensorBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { + m_is_identity = true; + for (int i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); + if (m_impl.dimensions()[i] != op.sizes()[i] || op.startIndices()[i] != 0) { + m_is_identity = false; + } + } + + // No strides for scalars. + if (NumDims == 0) return; + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const Sizes& output_dims = op.sizes(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + + // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * output_dims[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); + } + } else { + m_inputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + + // Don't initialize m_fastOutputStrides[NumDims-1] since it won't ever be accessed. + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * output_dims[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? 
m_outputStrides[i] : 1); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (!NumTraits>::RequireInitialization && data && m_impl.data()) { + Index contiguous_values = 1; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } + // Use memcpy if it's going to be faster than using the regular evaluation. + const internal::MemcpyTriggerForSlicing trigger(m_device); + if (trigger(internal::array_prod(dimensions()), contiguous_values)) { + EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); + for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { + Index offset = srcCoeff(i); + m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src + offset), + contiguous_values * sizeof(Scalar)); + } + return false; + } + } + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType /*data*/, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + const int packetSize = PacketType::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < internal::array_prod(dimensions())); + + if (m_is_identity) { + return m_impl.template packet(index); + } + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[NumDims - 1]); + inputIndices[1] += (indices[1] + m_offsets[NumDims - 1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[packetSize]; + values[0] = 
m_impl.coeff(inputIndices[0]); + values[packetSize - 1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP + for (int i = 1; i < packetSize - 1; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + internal::TensorBlockResourceRequirements::skewed(target_size), m_impl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset())); + TensorBlock block = m_impl.block(arg_desc, scratch); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); + return block; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); + if (result) { + Index offset = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i + 1; j < NumDims; ++j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i - 1; j >= 0; --j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } + return result + offset; + } + return NULL; + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[0]); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[NumDims - 1]); + } + return inputIndex; + } + + array m_outputStrides; + array, NumDims> m_fastOutputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + const Device EIGEN_DEVICE_REF m_device; + Dimensions m_dimensions; + bool m_is_identity; + const StartIndices m_offsets; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorSlicingOp XprType; + static constexpr int NumDims = internal::array_size::value; + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + 
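+    // (Descriptive note, not from the upstream sources: a slice can only be
+    // written through raw memory when it is rank-1 and the underlying
+    // expression itself exposes raw access, which is what the RawAccess flag
+    // below encodes.)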
IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + CoordAccess = false, + RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + if (this->m_is_identity) { + this->m_impl.template writePacket(index, x); + return; + } + + const int packetSize = PacketType::size; + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[0]); + inputIndices[1] += (indices[1] + this->m_offsets[0]); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[NumDims - 1]); + inputIndices[1] += (indices[1] + this->m_offsets[NumDims - 1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + this->m_impl.template writePacket(inputIndices[0], x); + } else { + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + internal::pstore(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize - 1]; + EIGEN_UNROLL_LOOP + for (int i = 1; i < packetSize - 1; ++i) { + this->coeffRef(index + i) = values[i]; + } + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) { + TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset())); + this->m_impl.writeBlock(arg_desc, block); + } +}; + +namespace internal { +template +struct traits> : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = array_size::value; + static constexpr int 
Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorStridingSlicingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, + typename eval>::type> { + typedef TensorStridingSlicingOp type; +}; + +} // end namespace internal + +template +class TensorStridingSlicingOp + : public TensorBase> { + public: + typedef TensorBase> Base; + typedef typename internal::traits::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp(const XprType& expr, const StartIndices& startIndices, + const StopIndices& stopIndices, const Strides& strides) + : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices), m_strides(strides) {} + + EIGEN_DEVICE_FUNC const StartIndices& startIndices() const { return m_startIndices; } + EIGEN_DEVICE_FUNC const StartIndices& stopIndices() const { return m_stopIndices; } + EIGEN_DEVICE_FUNC const StartIndices& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingSlicingOp) + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_startIndices; + const StopIndices m_stopIndices; + const Strides m_strides; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorStridingSlicingOp XprType; + static constexpr int NumDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef Strides Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets and sizes. 
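+    // (Illustrative note, not from the upstream sources: a strided slice
+    // with, say, start {1}, stop {6}, stride {2} touches coefficients 1, 3
+    // and 5, so neither alignment nor contiguous packet loads can be
+    // assumed, and the flags below disable them.)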
+ IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()) { + // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero + DSizes startIndicesClamped, stopIndicesClamped; + for (ptrdiff_t i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); + if (m_strides[i] > 0) { + startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); + stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); + } else { + /* implies m_strides[i] < 0 by assert */ + startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); + stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); + } + m_startIndices[i] = startIndicesClamped[i]; + } + + typedef typename TensorEvaluator::Dimensions InputDimensions; + const InputDimensions& input_dims = m_impl.dimensions(); + + // compute output tensor shape + m_is_identity = true; + for (int i = 0; i < NumDims; i++) { + Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; + if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) { + m_dimensions[i] = 0; + } else { + m_dimensions[i] = (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 1 : 0); + eigen_assert(m_dimensions[i] >= 0); + } + if (m_strides[i] != 1 || interval != m_impl.dimensions()[i]) { + m_is_identity = false; + } + } + + Strides output_dims = m_dimensions; + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = m_strides[0]; + m_offsets[0] = startIndicesClamped[0]; + Index previousDimProduct = 1; + for (int i = 1; i < NumDims; ++i) { + previousDimProduct *= input_dims[i - 1]; + m_inputStrides[i] = previousDimProduct * m_strides[i]; + m_offsets[i] = startIndicesClamped[i] * previousDimProduct; + } + + // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * output_dims[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); + } + } else { + m_inputStrides[NumDims - 1] = m_strides[NumDims - 1]; + m_offsets[NumDims - 1] = startIndicesClamped[NumDims - 1]; + Index previousDimProduct = 1; + for (int i = NumDims - 2; i >= 0; --i) { + previousDimProduct *= input_dims[i + 1]; + m_inputStrides[i] = previousDimProduct * m_strides[i]; + m_offsets[i] = startIndicesClamped[i] * previousDimProduct; + } + + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * output_dims[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? 
m_outputStrides[i] : 1); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i] + m_offsets[i]; + index -= idx * m_outputStrides[i]; + } + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i] + m_offsets[i]; + index -= idx * m_outputStrides[i]; + } + } + return inputIndex; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { +#ifndef SYCL_DEVICE_ONLY + return numext::maxi(min, numext::mini(max, value)); +#else + return cl::sycl::clamp(value, min, max); +#endif + } + + array m_outputStrides; + array, NumDims> m_fastOutputStrides; + array m_inputStrides; + bool m_is_identity; + TensorEvaluator m_impl; + const Device EIGEN_DEVICE_REF m_device; + DSizes m_startIndices; // clamped startIndices + DSizes m_dimensions; + DSizes m_offsets; // offset in a flattened shape + const Strides m_strides; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorStridingSlicingOp XprType; + static constexpr int NumDims = internal::array_size::value; + static constexpr int Layout = TensorEvaluator::Layout; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = TensorEvaluator::CoordAccess, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Strides Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h 
b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h new file mode 100644 index 00000000000..0e8f54a8052 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -0,0 +1,620 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorPadding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor padding class. + * At the moment only padding with a constant value is supported. + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorPaddingOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorPaddingOp type; +}; + +} // end namespace internal + +template +class TensorPaddingOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, + const Scalar padding_value) + : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC const PaddingDimensions& padding() const { return m_padding_dims; } + EIGEN_DEVICE_FUNC Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PaddingDimensions m_padding_dims; + const Scalar m_padding_value; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorPaddingOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = true, + CoordAccess = true, + RawAccess = false + }; + + typedef 
std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device) { + // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead + // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector + // of 1 element first and then pad. + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Compute dimensions + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] += m_padding[i].first + m_padding[i].second; + } + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + m_outputStrides[NumDims] = m_outputStrides[NumDims - 1] * m_dimensions[NumDims - 1]; + } else { + m_inputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + m_outputStrides[i + 1] = m_outputStrides[i + 2] * m_dimensions[i + 1]; + } + m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (isPaddingAtIndexForDim(idx, i)) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (isPaddingAtIndexForDim(index, 0)) { + return m_paddingValue; + } + inputIndex += (index - m_padding[0].first); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i + 1]; + if (isPaddingAtIndexForDim(idx, i)) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i + 1]; + } + if (isPaddingAtIndexForDim(index, NumDims - 1)) { + return m_paddingValue; + } + inputIndex += (index - m_padding[NumDims - 1].first); + } + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType 
packet(Index index) const {
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      return packetColMajor(index);
+    }
+    return packetRowMajor(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    TensorOpCost cost = m_impl.costPerCoeff(vectorized);
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < NumDims; ++i) updateCostPerDimension(cost, i, i == 0);
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 1; i >= 0; --i) updateCostPerDimension(cost, i, i == NumDims - 1);
+    }
+    return cost;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size), m_impl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    // If one of the dimensions is zero, return empty block view.
+    if (desc.size() == 0) {
+      return TensorBlock(internal::TensorBlockKind::kView, NULL, desc.dimensions());
+    }
+
+    static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
+    const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;
+
+    Index offset = desc.offset();
+
+    // Compute offsets in the output tensor corresponding to the desc.offset().
+    DSizes<Index, NumDims> output_offsets;
+    for (int i = NumDims - 1; i > 0; --i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      const int stride_dim = IsColMajor ? dim : dim + 1;
+      output_offsets[dim] = offset / m_outputStrides[stride_dim];
+      offset -= output_offsets[dim] * m_outputStrides[stride_dim];
+    }
+    output_offsets[inner_dim_idx] = offset;
+
+    // Offsets in the input corresponding to output offsets.
+    DSizes<Index, NumDims> input_offsets = output_offsets;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
+    }
+
+    // Compute offset in the input buffer (at this point it might be illegal
+    // and point outside of the input buffer, because we don't check for
+    // negative offsets; it will be autocorrected in the block iteration loop
+    // below).
+    Index input_offset = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offset += input_offsets[dim] * m_inputStrides[dim];
+    }
+
+    // Destination buffer and scratch buffer both indexed from 0 and have the
+    // same dimensions as the requested block (for destination buffer this
+    // property is guaranteed by `desc.destination()`).
+    Index output_offset = 0;
+    const DSizes<Index, NumDims> output_strides = internal::strides<Layout>(desc.dimensions());
+
+    // NOTE(ezhulenev): We initialize block iteration state for `NumDims - 1`
+    // dimensions, skipping innermost dimension. In theory it should be possible
+    // to squeeze matching innermost dimensions, however in practice that did
+    // not show any improvements in benchmarks. Also, in practice the first
+    // outer dimension usually has padding, which will prevent squeezing.
+
+    // Initialize output block iterator state. Dimensions in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims - 1> it;
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const int dim = IsColMajor ?
i + 1 : NumDims - i - 2; + it[i].count = 0; + it[i].size = desc.dimension(dim); + + it[i].input_stride = m_inputStrides[dim]; + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + it[i].output_stride = output_strides[dim]; + it[i].output_span = it[i].output_stride * (it[i].size - 1); + } + + const Index input_inner_dim_size = static_cast(m_impl.dimensions()[inner_dim_idx]); + + // Total output size. + const Index output_size = desc.size(); + + // We will fill inner dimension of this size in the output. It might be + // larger than the inner dimension in the input, so we might have to pad + // before/after we copy values from the input inner dimension. + const Index output_inner_dim_size = desc.dimension(inner_dim_idx); + + // How many values to fill with padding BEFORE reading from the input inner + // dimension. + const Index output_inner_pad_before_size = + input_offsets[inner_dim_idx] < 0 + ? numext::mini(numext::abs(input_offsets[inner_dim_idx]), output_inner_dim_size) + : 0; + + // How many values we can actually copy from the input inner dimension. + const Index output_inner_copy_size = numext::mini( + // Want to copy from input. + (output_inner_dim_size - output_inner_pad_before_size), + // Can copy from input. + numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] + output_inner_pad_before_size), Index(0))); + + eigen_assert(output_inner_copy_size >= 0); + + // How many values to fill with padding AFTER reading from the input inner + // dimension. + const Index output_inner_pad_after_size = + (output_inner_dim_size - output_inner_copy_size - output_inner_pad_before_size); + + // Sanity check, sum of all sizes must be equal to the output size. + eigen_assert(output_inner_dim_size == + (output_inner_pad_before_size + output_inner_copy_size + output_inner_pad_after_size)); + + // Keep track of current coordinates and padding in the output. + DSizes output_coord = output_offsets; + DSizes output_padded; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + } + + typedef internal::StridedLinearBufferCopy LinCopy; + + // Prepare storage for the materialized padding result. + const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + + // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a + // single logical inner dimension. + + // When possible we squeeze writes for the innermost (only if non-padded) + // dimension with the first padded dimension. This allows to reduce the + // number of calls to LinCopy and better utilize vector instructions. + const bool squeeze_writes = NumDims > 1 && + // inner dimension is not padded + (input_inner_dim_size == m_dimensions[inner_dim_idx]) && + // and equal to the block inner dimension + (input_inner_dim_size == output_inner_dim_size); + + const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1; + + // Maximum coordinate on a squeeze dimension that we can write to. + const Index squeeze_max_coord = + squeeze_writes ? numext::mini( + // max non-padded element in the input + static_cast(m_dimensions[squeeze_dim] - m_padding[squeeze_dim].second), + // max element in the output buffer + static_cast(output_offsets[squeeze_dim] + desc.dimension(squeeze_dim))) + : static_cast(0); + + // Iterate copying data from `m_impl.data()` to the output buffer. 
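+    // (Illustrative walk-through, not from the upstream sources: for a 1-D
+    // input of size 4 padded by (2, 3), the output inner dimension has size
+    // 9 = 2 (pad before) + 4 (copy from input) + 3 (pad after), which is the
+    // decomposition asserted above.)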
+ for (Index size = 0; size < output_size;) { + // Detect if we are in the padded region (exclude innermost dimension). + bool is_padded = false; + for (int j = 1; j < NumDims; ++j) { + const int dim = IsColMajor ? j : NumDims - j - 1; + is_padded = output_padded[dim]; + if (is_padded) break; + } + + if (is_padded) { + // Fill single innermost dimension with padding value. + size += output_inner_dim_size; + + LinCopy::template Run(typename LinCopy::Dst(output_offset, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_dim_size); + + } else if (squeeze_writes) { + // Squeeze multiple reads from innermost dimensions. + const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim]; + size += output_inner_dim_size * squeeze_num; + + // Copy `squeeze_num` inner dimensions from input to output. + LinCopy::template Run(typename LinCopy::Dst(output_offset, 1, block_storage.data()), + typename LinCopy::Src(input_offset, 1, m_impl.data()), + output_inner_dim_size * squeeze_num); + + // Update iteration state for only `squeeze_num - 1` processed inner + // dimensions, because we have another iteration state update at the end + // of the loop that will update iteration state for the last inner + // processed dimension. + it[0].count += (squeeze_num - 1); + input_offset += it[0].input_stride * (squeeze_num - 1); + output_offset += it[0].output_stride * (squeeze_num - 1); + output_coord[squeeze_dim] += (squeeze_num - 1); + + } else { + // Single read from innermost dimension. + size += output_inner_dim_size; + + { // Fill with padding before copying from input inner dimension. + const Index out = output_offset; + + LinCopy::template Run(typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_pad_before_size); + } + + { // Copy data from input inner dimension. + const Index out = output_offset + output_inner_pad_before_size; + const Index in = input_offset + output_inner_pad_before_size; + + eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL); + + LinCopy::template Run(typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(in, 1, m_impl.data()), + output_inner_copy_size); + } + + { // Fill with padding after copying from input inner dimension. + const Index out = output_offset + output_inner_pad_before_size + output_inner_copy_size; + + LinCopy::template Run(typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_pad_after_size); + } + } + + for (int j = 0; j < NumDims - 1; ++j) { + const int dim = IsColMajor ? 
j + 1 : NumDims - j - 2;
+
+        if (++it[j].count < it[j].size) {
+          input_offset += it[j].input_stride;
+          output_offset += it[j].output_stride;
+          output_coord[dim] += 1;
+          output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+          break;
+        }
+        it[j].count = 0;
+        input_offset -= it[j].input_span;
+        output_offset -= it[j].output_span;
+        output_coord[dim] -= it[j].size - 1;
+        output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
+
+ private:
+  struct BlockIteratorState {
+    BlockIteratorState() : count(0), size(0), input_stride(0), input_span(0), output_stride(0), output_span(0) {}
+
+    Index count;
+    Index size;
+    Index input_stride;
+    Index input_span;
+    Index output_stride;
+    Index output_span;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(Index index, int dim_index) const {
+    return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) &&
+            index < m_padding[dim_index].first) ||
+           (!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) &&
+            index >= m_dimensions[dim_index] - m_padding[dim_index].second);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(int dim_index) const {
+    return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(int dim_index) const {
+    return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0);
+  }
+
+  void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
+    const double in = static_cast<double>(m_impl.dimensions()[i]);
+    const double out = in + m_padding[i].first + m_padding[i].second;
+    if (out == 0) return;
+    const double reduction = in / out;
+    cost *= reduction;
+    if (first) {
+      cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() + reduction * (1 * TensorOpCost::AddCost<Index>()));
+    } else {
+      cost += TensorOpCost(0, 0,
+                           2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+                               reduction * (2 * TensorOpCost::MulCost<Index>() + 1 * TensorOpCost::DivCost<Index>()));
+    }
+  }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    const Index initialIndex = index;
+    Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index firstIdx = index;
+      const Index lastIdx = index + PacketSize - 1;
+      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
+      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
+      const Index lastPaddedRight = m_outputStrides[i + 1];
+
+      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
+        // all the coefficients are in the padding zone.
+        return internal::pset1<PacketReturnType>(m_paddingValue);
+      } else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+        // all the coefficients are in the padding zone.
+        return internal::pset1<PacketReturnType>(m_paddingValue);
+      } else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) ||
+                 (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+        // all the coefficients are between the 2 padding zones.
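+        // (Descriptive note, not from the upstream sources: the whole packet
+        // lies in the interior for this dimension, so the output coordinate
+        // is translated back to the input by subtracting the left padding.)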
+        const Index idx = index / m_outputStrides[i];
+        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      } else {
+        // Every other case
+        return packetWithPossibleZero(initialIndex);
+      }
+    }
+
+    const Index lastIdx = index + PacketSize - 1;
+    const Index firstIdx = index;
+    const Index lastPaddedLeft = m_padding[0].first;
+    const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
+    const Index lastPaddedRight = m_outputStrides[1];
+
+    if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) {
+      // all the coefficients are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    } else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+      // all the coefficients are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    } else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) ||
+               (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+      // all the coefficients are between the 2 padding zones.
+      inputIndex += (index - m_padding[0].first);
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
+    // Every other case
+    return packetWithPossibleZero(initialIndex);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    const Index initialIndex = index;
+    Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index firstIdx = index;
+      const Index lastIdx = index + PacketSize - 1;
+      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i + 1];
+      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i + 1];
+      const Index lastPaddedRight = m_outputStrides[i];
+
+      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
+        // all the coefficients are in the padding zone.
+        return internal::pset1<PacketReturnType>(m_paddingValue);
+      } else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+        // all the coefficients are in the padding zone.
+        return internal::pset1<PacketReturnType>(m_paddingValue);
+      } else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) ||
+                 (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+        // all the coefficients are between the 2 padding zones.
+        const Index idx = index / m_outputStrides[i + 1];
+        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i + 1];
+      } else {
+        // Every other case
+        return packetWithPossibleZero(initialIndex);
+      }
+    }
+
+    const Index lastIdx = index + PacketSize - 1;
+    const Index firstIdx = index;
+    const Index lastPaddedLeft = m_padding[NumDims - 1].first;
+    const Index firstPaddedRight = (m_dimensions[NumDims - 1] - m_padding[NumDims - 1].second);
+    const Index lastPaddedRight = m_outputStrides[NumDims - 1];
+
+    if (!isLeftPaddingCompileTimeZero(NumDims - 1) && lastIdx < lastPaddedLeft) {
+      // all the coefficients are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    } else if (!isRightPaddingCompileTimeZero(NumDims - 1) && firstIdx >= firstPaddedRight &&
+               lastIdx < lastPaddedRight) {
+      // all the coefficients are in the padding zone.
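+      // (Descriptive note, not from the upstream sources: every lane of the
+      // packet falls in the right padding region of the last dimension, so a
+      // broadcast of the padding value is returned without reading the input.)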
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    } else if ((isLeftPaddingCompileTimeZero(NumDims - 1) && isRightPaddingCompileTimeZero(NumDims - 1)) ||
+               (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+      // all the coefficients are between the 2 padding zones.
+      inputIndex += (index - m_padding[NumDims - 1].first);
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
+    // Every other case
+    return packetWithPossibleZero(initialIndex);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const {
+    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = coeff(index + i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  Dimensions m_dimensions;
+  array<Index, NumDims + 1> m_outputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+  PaddingDimensions m_padding;
+
+  Scalar m_paddingValue;
+
+  const Device EIGEN_DEVICE_REF m_device;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
new file mode 100644
index 00000000000..7b33f1fe19c
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -0,0 +1,260 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
+#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class TensorPatch
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor patch class.
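+ *
+ * Extracts every patch of a given shape from the input and adds one dimension
+ * that indexes the patches. A minimal usage sketch (illustrative only, not
+ * part of the upstream sources):
+ * \code
+ * Eigen::Tensor<float, 2> input(4, 5);
+ * input.setRandom();
+ * Eigen::array<Eigen::Index, 2> patch_dims{2, 2};
+ * // For ColMajor the result is 2x2x12: twelve 2x2 patches, since
+ * // (4 - 2 + 1) * (5 - 2 + 1) = 12.
+ * Eigen::Tensor<float, 3> patches = input.extract_patches(patch_dims);
+ * \endcode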
+ * + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions + 1; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorPatchOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorPatchOp type; +}; + +} // end namespace internal + +template +class TensorPatchOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims) + : m_xpr(expr), m_patch_dims(patch_dims) {} + + EIGEN_DEVICE_FUNC const PatchDim& patch_dims() const { return m_patch_dims; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PatchDim m_patch_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorPatchOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value + 1; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { + Index num_patches = 1; + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const PatchDim& patch_dims = op.patch_dims(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims - 1; ++i) { + m_dimensions[i] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[NumDims - 1] = num_patches; + + m_inputStrides[0] = 1; + m_patchStrides[0] = 1; + for (int i = 1; i < NumDims - 1; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + m_patchStrides[i] = m_patchStrides[i - 1] * (input_dims[i - 1] - patch_dims[i - 1] + 1); + } + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + 
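+        // (Descriptive note, not from the upstream sources: in RowMajor order
+        // the patch index becomes dimension 0, so every patch extent shifts
+        // up by one position.)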
m_dimensions[i + 1] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[0] = num_patches; + + m_inputStrides[NumDims - 2] = 1; + m_patchStrides[NumDims - 2] = 1; + for (int i = NumDims - 3; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + m_patchStrides[i] = m_patchStrides[i + 1] * (input_dims[i + 1] - patch_dims[i + 1] + 1); + } + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? NumDims - 1 : 0; + // Find the location of the first element of the patch. + Index patchIndex = index / m_outputStrides[output_stride_index]; + // Find the offset of the element wrt the location of the first element. + Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index]; + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i]; + patchOffset -= offsetIdx * m_outputStrides[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i + 1]; + patchOffset -= offsetIdx * m_outputStrides[i + 1]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } + inputIndex += (patchIndex + patchOffset); + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? 
NumDims - 1 : 0; + Index indices[2] = {index, index + PacketSize - 1}; + Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index], + indices[1] / m_outputStrides[output_stride_index]}; + Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index], + indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]}; + + Index inputIndices[2] = {0, 0}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i], patchOffsets[1] / m_outputStrides[i]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i + 1], patchOffsets[1] / m_outputStrides[i + 1]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i + 1]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i + 1]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + } + inputIndices[0] += (patchIndices[0] + patchOffsets[0]); + inputIndices[1] += (patchIndices[1] + patchOffsets[1]); + + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } else { + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[PacketSize - 1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize - 1; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (TensorOpCost::DivCost() + TensorOpCost::MulCost() + + 2 * TensorOpCost::AddCost()); + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + array m_patchStrides; + + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h new file mode 100644 index 00000000000..c9c613a1489 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -0,0 +1,309 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2016 Benoit Steiner +// Copyright (C) 2018 Mehdi Goli Codeplay Software Ltd. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H +#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t get_random_seed() { +#if defined(EIGEN_GPU_COMPILE_PHASE) + // We don't support 3d kernels since we currently only use 1 and + // 2d kernels. + gpu_assert(threadIdx.z == 0); + return blockIdx.x * blockDim.x + threadIdx.x + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); +#else + // Rely on Eigen's random implementation. + return random(); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { + // TODO: Unify with the implementation in the non blocking thread pool. + uint64_t current = *state; + // Update the internal state + *state = current * 6364136223846793005ULL + (stream << 1 | 1); + // Generate the random output (using the PCG-XSH-RS scheme) + return static_cast((current ^ (current >> 22)) >> (22 + (current >> 61))); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { + seed = seed ? seed : get_random_seed(); + return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeUniform(uint64_t* state, uint64_t stream) { + unsigned rnd = PCG_XSH_RS_generator(state, stream); + return static_cast(rnd); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half RandomToTypeUniform(uint64_t* state, uint64_t stream) { + // Generate 10 random bits for the mantissa, merge with exponent. + unsigned rnd = PCG_XSH_RS_generator(state, stream); + const uint16_t half_bits = static_cast(rnd & 0x3ffu) | (static_cast(15) << 10); + Eigen::half result = Eigen::numext::bit_cast(half_bits); + // Return the final result + return result - Eigen::half(1.0f); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 RandomToTypeUniform(uint64_t* state, + uint64_t stream) { + // Generate 7 random bits for the mantissa, merge with exponent. 
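+  // (For reference: bfloat16 packs 1 sign, 8 exponent, and 7 mantissa bits.
+  // Setting the exponent field to the bias (127) puts the encoded value in
+  // [1, 2), so subtracting 1 below yields a uniform sample in [0, 1).)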
+  unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
+  Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
+  // Return the final result
+  return result - Eigen::bfloat16(1.0f);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
+  typedef union {
+    uint32_t raw;
+    float fp;
+  } internal;
+  internal result;
+  // Generate 23 random bits for the mantissa
+  const unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  result.raw = rnd & 0x7fffffu;
+  // Set the exponent
+  result.raw |= (static_cast<uint32_t>(127) << 23);
+  // Return the final result
+  return result.fp - 1.0f;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
+  typedef union {
+    uint64_t raw;
+    double dp;
+  } internal;
+  internal result;
+  result.raw = 0;
+  // Generate 52 random bits for the mantissa
+  // First generate the upper 20 bits
+  unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
+  // Then generate the lower 32 bits
+  unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
+  result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
+  // Set the exponent
+  result.raw |= (static_cast<uint64_t>(1023) << 52);
+  // Return the final result
+  return result.dp - 1.0;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state,
+                                                                                                    uint64_t stream) {
+  return std::complex<float>(RandomToTypeUniform<float>(state, stream), RandomToTypeUniform<float>(state, stream));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state,
+                                                                                                      uint64_t stream) {
+  return std::complex<double>(RandomToTypeUniform<double>(state, stream), RandomToTypeUniform<double>(state, stream));
+}
+
+template <typename T>
+class UniformRandomGenerator {
+ public:
+  static constexpr bool PacketAccess = true;
+
+  // Uses the given "seed" if non-zero, otherwise uses a random seed.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(uint64_t seed = 0) {
+    m_state = PCG_XSH_RS_state(seed);
+#ifdef EIGEN_USE_SYCL
+    // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+    // Therefore, we need two steps to initialize m_state.
+    // In SYCL, the constructor of the functor is called on the CPU,
+    // and we get the clock seed here from the CPU. However, this seed is
+    // the same for all threads. Unlike CUDA, the thread ID, block ID, etc. are not
+    // global functions and are only available in operator() (which is called on the GPU).
+    // Thus for CUDA (((CLOCK + global_thread_id) * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each
+    // thread, but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread, and
+    // each thread adds (global_thread_id * 6364136223846793005ULL) for itself, only once, to complete the
+    // construction similarly to CUDA. Therefore, the thread-id injection is not available at this stage;
+    // however, when operator() is called the thread ID will be available. So inside the operator
+    // we add the thread ID, block ID, ... (which is the equivalent of i)
+    // to the seed and construct the unique m_state per thread, similarly to CUDA.
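+    // Schematically (illustrative pseudocode, not the upstream API; MUL and INC
+    // stand for the two PCG constants used above):
+    //   host (here):          m_state = CLOCK * MUL + INC;
+    //   device (operator()):  m_state += global_thread_id * MUL;  // applied once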
+    m_exec_once = false;
+#endif
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(const UniformRandomGenerator& other) {
+    m_state = other.m_state;
+#ifdef EIGEN_USE_SYCL
+    m_exec_once = other.m_exec_once;
+#endif
+  }
+
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const {
+#ifdef EIGEN_USE_SYCL
+    if (!m_exec_once) {
+      // This is the second stage of adding the thread ID to the CPU clock seed, building a unique seed per thread.
+      // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once = true;
+    }
+#endif
+    T result = RandomToTypeUniform<T>(&m_state, i);
+    return result;
+  }
+
+  template <typename Packet, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const {
+    const int packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_MAX T values[packetSize];
+#ifdef EIGEN_USE_SYCL
+    if (!m_exec_once) {
+      // This is the second stage of adding the thread ID to the CPU clock seed, building a unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once = true;
+    }
+#endif
+    EIGEN_UNROLL_LOOP
+    for (int j = 0; j < packetSize; ++j) {
+      values[j] = RandomToTypeUniform<T>(&m_state, i);
+    }
+    return internal::pload<Packet>(values);
+  }
+
+ private:
+  mutable uint64_t m_state;
+#ifdef EIGEN_USE_SYCL
+  mutable bool m_exec_once;
+#endif
+};
+
+template <typename Scalar>
+struct functor_traits<UniformRandomGenerator<Scalar> > {
+  enum {
+    // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
+    Cost = 12 * NumTraits<Scalar>::AddCost * ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
+    PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
+  };
+};
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
+  // Use the ratio-of-uniforms method to generate numbers following a normal
+  // distribution. See for example Numerical Recipes chapter 7.3.9 for the
+  // details.
+  T u, v, q;
+  do {
+    u = RandomToTypeUniform<T>(state, stream);
+    v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
+    const T x = u - T(0.449871);
+    const T y = numext::abs(v) + T(0.386595);
+    q = x * x + y * (T(0.196) * y - T(0.25472) * x);
+  } while (q > T(0.27597) && (q > T(0.27846) || v * v > T(-4) * numext::log(u) * u * u));
+
+  return v / u;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state,
+                                                                                                   uint64_t stream) {
+  return std::complex<float>(RandomToTypeNormal<float>(state, stream), RandomToTypeNormal<float>(state, stream));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state,
+                                                                                                     uint64_t stream) {
+  return std::complex<double>(RandomToTypeNormal<double>(state, stream), RandomToTypeNormal<double>(state, stream));
+}
+
+template <typename T>
+class NormalRandomGenerator {
+ public:
+  static constexpr bool PacketAccess = true;
+
+  // Uses the given "seed" if non-zero, otherwise uses a random seed.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
+    m_state = PCG_XSH_RS_state(seed);
+#ifdef EIGEN_USE_SYCL
+    // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+    // Therefore, we need two steps to initialize m_state.
+    // In SYCL, the constructor of the functor is called on the CPU,
+    // and we get the clock seed here from the CPU. However, this seed is
+    // the same for all threads. Unlike CUDA, the thread ID, block ID, etc. are not
+    // global functions and are only available in operator() (which is called on the GPU).
+    // Therefore, the thread-id injection is not available at this stage; however, when operator()
+    // is called the thread ID will be available. So inside the operator
+    // we add the thread ID, block ID, ... (which is the equivalent of i)
+    // to the seed and construct the unique m_state per thread, similarly to CUDA.
+    m_exec_once = false;
+#endif
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(const NormalRandomGenerator& other) {
+    m_state = other.m_state;
+#ifdef EIGEN_USE_SYCL
+    m_exec_once = other.m_exec_once;
+#endif
+  }
+
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const {
+#ifdef EIGEN_USE_SYCL
+    if (!m_exec_once) {
+      // This is the second stage of adding the thread ID to the CPU clock seed, building a unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once = true;
+    }
+#endif
+    T result = RandomToTypeNormal<T>(&m_state, i);
+    return result;
+  }
+
+  template <typename Packet, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const {
+    const int packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_MAX T values[packetSize];
+#ifdef EIGEN_USE_SYCL
+    if (!m_exec_once) {
+      // This is the second stage of adding the thread ID to the CPU clock seed, building a unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once = true;
+    }
+#endif
+    EIGEN_UNROLL_LOOP
+    for (int j = 0; j < packetSize; ++j) {
+      values[j] = RandomToTypeNormal<T>(&m_state, i);
+    }
+    return internal::pload<Packet>(values);
+  }
+
+ private:
+  mutable uint64_t m_state;
+#ifdef EIGEN_USE_SYCL
+  mutable bool m_exec_once;
+#endif
+};
+
+template <typename Scalar>
+struct functor_traits<NormalRandomGenerator<Scalar> > {
+  enum {
+    // On average, we need to generate about 3 random numbers
+    // 15 mul, 8 add, 1.5 logs
+    Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost + 15 * NumTraits<Scalar>::AddCost +
+           8 * NumTraits<Scalar>::AddCost + 3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
+    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
+  };
+};
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
new file mode 100644
index 00000000000..2ecbb7c209e
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -0,0 +1,1038 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
+
+// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
+// so we'll use a macro to make clang happy.
+#ifndef KERNEL_FRIEND
+#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
+#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
+#else
+#define KERNEL_FRIEND friend
+#endif
+#endif
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class TensorReduction
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor reduction class.
+ * + */ + +namespace internal { +template class MakePointer_> +struct traits > : traits { + typedef traits XprTraits; + typedef typename XprTraits::Scalar Scalar; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + static constexpr int NumDimensions = XprTraits::NumDimensions - array_size::value; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; + + template + struct MakePointer { + // Intermediate typedef to workaround MSVC issue. + typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; +}; + +template class MakePointer_> +struct eval, Eigen::Dense> { + typedef const TensorReductionOp& type; +}; + +template class MakePointer_> +struct nested, 1, + typename eval >::type> { + typedef TensorReductionOp type; +}; + +template +struct DimInitializer { + template + EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims, + const array::value>& reduced, + OutputDims* output_dims, ReducedDims* reduced_dims) { + const int NumInputDims = internal::array_size::value; + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (reduced[i]) { + (*reduced_dims)[reduceIndex] = input_dims[i]; + ++reduceIndex; + } else { + (*output_dims)[outputIndex] = input_dims[i]; + ++outputIndex; + } + } + } +}; + +template <> +struct DimInitializer > { + template + EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims, const array&, Sizes<>*, + array* reduced_dims) { + const int NumInputDims = internal::array_size::value; + for (int i = 0; i < NumInputDims; ++i) { + (*reduced_dims)[i] = input_dims[i]; + } + } +}; + +template +struct are_inner_most_dims { + static const bool value = false; +}; +template +struct preserve_inner_most_dims { + static const bool value = false; +}; + +template +struct are_inner_most_dims { + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_eq(0, 0); + static const bool tmp3 = + index_statically_eq(array_size::value - 1, array_size::value - 1); + static const bool value = tmp1 & tmp2 & tmp3; +}; +template +struct are_inner_most_dims { + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_eq(0, NumTensorDims - array_size::value); + static const bool tmp3 = index_statically_eq(array_size::value - 1, NumTensorDims - 1); + static const bool value = tmp1 & tmp2 & tmp3; +}; +template +struct preserve_inner_most_dims { + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_gt(0, 0); + static const bool value = tmp1 & tmp2; +}; +template +struct preserve_inner_most_dims { + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_lt(array_size::value - 1, NumTensorDims - 1); + static const bool value = tmp1 & tmp2; +}; + +template +struct GenericDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, + Op& reducer, typename Self::CoeffReturnType* accum) { + EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + GenericDimReducer::reduce(self, input, reducer, accum); + } + } +}; +template +struct GenericDimReducer<0, Self, Op> { + 
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, + Op& reducer, typename Self::CoeffReturnType* accum) { + for (int j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reduce(self.m_impl.coeff(input), accum); + } + } +}; +template +struct GenericDimReducer<-1, Self, Op> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, + typename Self::CoeffReturnType* accum) { + reducer.reduce(self.m_impl.coeff(index), accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce( + const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalize(accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce( + const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer0) { + using Index = typename Self::Index; + constexpr Index packetSize = internal::unpacket_traits::size; + Index start = 0; + typename Self::PacketReturnType paccum0 = reducer0.template initializePacket(); + if (!Self::ReducerTraits::IsStateful && numValuesToReduce >= 4 * packetSize) { + const Index VectorizedSize4 = (numValuesToReduce / (4 * packetSize)) * (4 * packetSize); + typename Self::PacketReturnType paccum1 = reducer0.template initializePacket(); + typename Self::PacketReturnType paccum2 = reducer0.template initializePacket(); + typename Self::PacketReturnType paccum3 = reducer0.template initializePacket(); + const Index offset0 = firstIndex; + const Index offset1 = firstIndex + packetSize; + const Index offset2 = firstIndex + 2 * packetSize; + const Index offset3 = firstIndex + 3 * packetSize; + for (Index j = 0; j < VectorizedSize4; j += 4 * packetSize) { + reducer0.reducePacket(self.m_impl.template packet(offset0 + j), &paccum0); + reducer0.reducePacket(self.m_impl.template packet(offset1 + j), &paccum1); + reducer0.reducePacket(self.m_impl.template packet(offset2 + j), &paccum2); + reducer0.reducePacket(self.m_impl.template packet(offset3 + j), &paccum3); + } + reducer0.reducePacket(paccum1, &paccum0); + reducer0.reducePacket(paccum2, &paccum0); + reducer0.reducePacket(paccum3, &paccum0); + start = VectorizedSize4; + } + if (start <= (numValuesToReduce - packetSize)) { + const Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; + for (Index j = start; j < VectorizedSize; j += packetSize) { + reducer0.reducePacket(self.m_impl.template packet(firstIndex + j), &paccum0); + } + start = VectorizedSize; + } + typename Self::CoeffReturnType accum = reducer0.initialize(); + for (Index j = start; j < numValuesToReduce; ++j) { + reducer0.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer0.finalizeBoth(accum, paccum0); + } +}; + +#if !defined(EIGEN_HIPCC) + +// The following implements tree-based reduction, which improves the accuracy +// of sum and mean reductions, since each of the n inputs only participates in +// O(log n) additions. 
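+// For intuition, a minimal standalone sketch of the same idea, assuming plain
+// float inputs and a sum reduction (the helper name and leaf size below are
+// illustrative, not upstream API):
+//
+//   float tree_sum(const float* data, Eigen::Index n, Eigen::Index leaf = 1024) {
+//     if (n <= leaf) {  // small range: plain sequential accumulation
+//       float acc = 0.0f;
+//       for (Eigen::Index i = 0; i < n; ++i) acc += data[i];
+//       return acc;
+//     }
+//     const Eigen::Index half = n / 2;  // split, reduce halves, combine once
+//     return tree_sum(data, half, leaf) + tree_sum(data + half, n - half, leaf);
+//   }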
+template +EIGEN_DEVICE_FUNC inline Index LeafSize() { + return 1024; +} +template <> +EIGEN_DEVICE_FUNC inline Index LeafSize() { + return 200; +} +template <> +EIGEN_DEVICE_FUNC inline Index LeafSize() { + return 128; +} + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce( + const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + const Index kLeafSize = LeafSize(); + typename Self::CoeffReturnType accum = reducer.initialize(); + if (numValuesToReduce > kLeafSize) { + const typename Self::Index half = numValuesToReduce / 2; + // Recursively reduce the two halves. + reducer.reduce(reduce(self, firstIndex, half, reducer), &accum); + reducer.reduce(reduce(self, firstIndex + half, numValuesToReduce - half, reducer), &accum); + return reducer.finalize(accum); + } else { + return InnerMostDimReducer::reduce(self, firstIndex, numValuesToReduce, reducer); + } + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce( + const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + const Index kLeafSize = LeafSize(); + const typename Self::Index packetSize = internal::unpacket_traits::size; + typename Self::CoeffReturnType accum = reducer.initialize(); + if (numValuesToReduce > packetSize * kLeafSize) { + // Make sure the split point is aligned on a packet boundary. + const typename Self::Index split = + packetSize * + numext::div_ceil(firstIndex + numext::div_ceil(numValuesToReduce, typename Self::Index(2)), packetSize); + const typename Self::Index num_left = numext::mini(split - firstIndex, numValuesToReduce); + reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum); + if (num_left < numValuesToReduce) { + reducer.reduce(reduce(self, split, numValuesToReduce - num_left, reducer), &accum); + } + return reducer.finalize(accum); + } else { + return InnerMostDimReducer::reduce(self, firstIndex, numValuesToReduce, reducer); + } + } +}; +#endif + +template +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, + typename Self::PacketReturnType*) { + eigen_assert(false && "should never be called"); + } +}; + +template +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, + Op& reducer, typename Self::PacketReturnType* accum) { + EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + InnerMostDimPreserver::reduce(self, input, reducer, accum); + } + } +}; + +template +struct InnerMostDimPreserver<0, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, + Op& reducer0, typename Self::PacketReturnType* accum0) { + using Index = typename Self::Index; + const Index stride = self.m_reducedStrides[0]; + const Index size = self.m_reducedDims[0]; + if (!Self::ReducerTraits::IsStateful && size >= 16) { + const Index unrolled_size4 = (size / 4) * 4; + typename Self::PacketReturnType accum1 = reducer0.template initializePacket(); + typename Self::PacketReturnType accum2 = reducer0.template initializePacket(); + typename Self::PacketReturnType 
accum3 = reducer0.template initializePacket(); + for (Index j = 0; j < unrolled_size4; j += 4) { + const Index input0 = firstIndex + j * stride; + reducer0.reducePacket(self.m_impl.template packet(input0), accum0); + const Index input1 = firstIndex + (j + 1) * stride; + reducer0.reducePacket(self.m_impl.template packet(input1), &accum1); + const Index input2 = firstIndex + (j + 2) * stride; + reducer0.reducePacket(self.m_impl.template packet(input2), &accum2); + const Index input3 = firstIndex + (j + 3) * stride; + reducer0.reducePacket(self.m_impl.template packet(input3), &accum3); + } + reducer0.reducePacket(accum1, accum0); + reducer0.reducePacket(accum2, accum0); + reducer0.reducePacket(accum3, accum0); + for (Index j = unrolled_size4; j < size; ++j) { + Index input = firstIndex + j * stride; + reducer0.reducePacket(self.m_impl.template packet(input), accum0); + } + } else { + for (Index j = 0; j < size; ++j) { + Index input = firstIndex + j * stride; + reducer0.reducePacket(self.m_impl.template packet(input), accum0); + } + } + } +}; +template +struct InnerMostDimPreserver<-1, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, + typename Self::PacketReturnType*) { + eigen_assert(false && "should never be called"); + } +}; + +// Default full reducer +template +struct FullReducer { + static constexpr bool HasOptimizedImplementation = false; + + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, + typename Self::EvaluatorPointerType output) { + const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); + *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + } +}; + +#ifdef EIGEN_USE_THREADS +// Multithreaded full reducers +template +struct FullReducerShard { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, + typename Self::Index numValuesToReduce, Op& reducer, + typename Self::CoeffReturnType* output) { + *output = InnerMostDimReducer::reduce(self, firstIndex, numValuesToReduce, reducer); + } +}; + +// Multithreaded full reducer +template +struct FullReducer { + static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful; + static constexpr Index PacketSize = unpacket_traits::size; + + // launch one reducer per thread and accumulate the result. + static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, + typename Self::CoeffReturnType* output) { + typedef typename Self::Index Index; + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + if (num_coeffs == 0) { + *output = reducer.finalize(reducer.initialize()); + return; + } + const TensorOpCost cost = self.m_impl.costPerCoeff(Vectorizable) + + TensorOpCost(0, 0, internal::functor_traits::Cost, Vectorizable, PacketSize); + const Index num_threads = TensorCostModel::numThreads(num_coeffs, cost, device.numThreads()); + if (num_threads == 1) { + *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + return; + } + const Index blocksize = num_coeffs / num_threads; + const Index numblocks = blocksize > 0 ? 
num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); + + Barrier barrier(internal::convert_index(numblocks)); + MaxSizeVector shards(numblocks, reducer.initialize()); + for (Index i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier(&barrier, &FullReducerShard::run, self, i * blocksize, + blocksize, reducer, &shards[i]); + } + typename Self::CoeffReturnType finalShard; + if (numblocks * blocksize < num_coeffs) { + finalShard = InnerMostDimReducer::reduce(self, numblocks * blocksize, + num_coeffs - numblocks * blocksize, reducer); + } else { + finalShard = reducer.initialize(); + } + barrier.Wait(); + + for (Index i = 0; i < numblocks; ++i) { + reducer.reduce(shards[i], &finalShard); + } + *output = reducer.finalize(finalShard); + } +}; + +#endif + +// Default inner reducer +template +struct InnerReducer { + static constexpr bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, + typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; + +// Default outer reducer +template +struct OuterReducer { + static constexpr bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, + typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; + +#ifdef EIGEN_USE_SYCL +// Default Generic reducer +template +struct GenericReducer { + static constexpr bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, + typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; +#endif + +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, + unsigned int*); + +#if defined(EIGEN_HAS_GPU_FP16) +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat( + R, const S, I_, internal::packet_traits::type*); +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, + internal::packet_traits::type*); +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); + +#endif + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); +#endif + +/** + * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op. + * This allows the reduction to have a different type for the accumulator than the input data type. + * If this is the case, the functor needs to have two reduce method: one for reducing an element of the input + * with the accumulator and the other for reducing two accumulators. + * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for + * some properties of the input. 
+ */ +template +struct ReductionReturnType { +#if defined(EIGEN_USE_SYCL) + typedef std::remove_const_t().initialize())> type; +#else + typedef std::remove_const_t type; +#endif +}; + +} // end namespace internal + +template class MakePointer_> +class TensorReductionOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims) + : m_expr(expr), m_dims(dims) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) + : m_expr(expr), m_dims(dims), m_reducer(reducer) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dims& dims() const { return m_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& reducer() const { return m_reducer; } + + protected: + typename XprType::Nested m_expr; + const Dims m_dims; + const Op m_reducer; +}; + +template +struct TensorReductionEvaluatorBase; + +// Eval as rvalue +template class MakePointer_, typename Device> +struct TensorReductionEvaluatorBase, Device> { + typedef internal::reducer_traits ReducerTraits; + typedef Dims ReducedDims; + typedef TensorReductionOp XprType; + typedef typename XprType::Index Index; + typedef ArgType ChildType; + typedef typename TensorEvaluator::Dimensions InputDimensions; + static constexpr int NumInputDims = internal::array_size::value; + static constexpr int NumReducedDims = internal::array_size::value; + static constexpr int NumOutputDims = NumInputDims - NumReducedDims; + typedef std::conditional_t, DSizes > Dimensions; + typedef typename XprType::Scalar Scalar; + typedef TensorReductionEvaluatorBase, Device> Self; + static constexpr bool InputPacketAccess = TensorEvaluator::PacketAccess; + typedef typename internal::ReductionReturnType::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr Index PacketSize = PacketType::size; + + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. 
+ static constexpr int NumPreservedStrides = max_n_1::size; + + // For full reductions +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + static constexpr bool RunningOnGPU = internal::is_same::value; + static constexpr bool RunningOnSycl = false; +#elif defined(EIGEN_USE_SYCL) + static constexpr bool RunningOnSycl = internal::is_same, Eigen::SyclDevice>::value; + static constexpr bool RunningOnGPU = false; +#else + static constexpr bool RunningOnGPU = false; + static constexpr bool RunningOnSycl = false; +#endif + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess, + BlockAccess = false, + PreferBlockAccess = true, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + static constexpr bool ReducingInnerMostDims = internal::are_inner_most_dims::value; + static constexpr bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; + static constexpr bool RunningFullReduction = (NumOutputDims == 0); + + EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) { + EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Build the bitmap indicating if an input dimension is reduced or not. + for (int i = 0; i < NumInputDims; ++i) { + m_reduced[i] = false; + } + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op.dims()[i] >= 0); + eigen_assert(op.dims()[i] < NumInputDims); + m_reduced[op.dims()[i]] = true; + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + internal::DimInitializer::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims); + + // Precompute output strides. + if (NumOutputDims > 0) { + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + } + } else { + m_outputStrides[static_cast(NumOutputDims - 1)] = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + } + } + } + + // Precompute input strides. 
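+    // (ColMajor example, for intuition: input dims {2, 3, 4} give input
+    // strides {1, 2, 6}; the RowMajor branch runs the same recursion from the back.)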
+ if (NumInputDims > 0) { + array input_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i - 1] * input_dims[i - 1]; + } + } else { + input_strides.back() = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + } + + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + m_reducedStrides[reduceIndex] = input_strides[i]; + ++reduceIndex; + } else { + m_preservedStrides[outputIndex] = input_strides[i]; + m_output_to_input_dim_map[outputIndex] = i; + ++outputIndex; + } + } + } + + // Special case for full reductions + if (NumOutputDims == 0) { + m_preservedStrides[0] = internal::array_prod(input_dims); + } + + m_numValuesToReduce = NumOutputDims == 0 ? internal::array_prod(input_dims) + : (static_cast(Layout) == static_cast(ColMajor)) + ? m_preservedStrides[0] + : m_preservedStrides[static_cast(NumOutputDims - 1)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) { + // Use the FullReducer if possible. + if ((RunningFullReduction && RunningOnSycl) || + (RunningFullReduction && internal::FullReducer::HasOptimizedImplementation && + ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || !RunningOnGPU))) { + bool need_assign = false; + if (!data) { + m_result = static_cast( + m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType)))); + data = m_result; + need_assign = true; + } + Op reducer(m_reducer); + internal::FullReducer::run(*this, reducer, m_device, data); + return need_assign; + } + + // Attempt to use an optimized reduction. 
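+    // (An "inner" reduction collapses the innermost, fastest-varying dimensions,
+    // an "outer" reduction the outermost ones; the checks below pick the matching
+    // specialized kernel by inspecting the m_reduced bitmap.)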
+ else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) { + bool reducing_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast(Layout) == static_cast(ColMajor)) { + reducing_inner_dims &= m_reduced[i]; + } else { + reducing_inner_dims &= m_reduced[NumInputDims - 1 - i]; + } + } + if (internal::InnerReducer::HasOptimizedImplementation && + (reducing_inner_dims || ReducingInnerMostDims)) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && + num_values_to_reduce > 128) || + (RunningOnSycl)) { + data = static_cast(m_device.get( + (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + m_result = data; + } else { + return true; + } + } + Op reducer(m_reducer); + // For SYCL this if always return false + if (internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, + num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } + } + + bool preserving_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast(Layout) == static_cast(ColMajor)) { + preserving_inner_dims &= m_reduced[NumInputDims - 1 - i]; + } else { + preserving_inner_dims &= m_reduced[i]; + } + } + if (internal::OuterReducer::HasOptimizedImplementation && preserving_inner_dims) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && + num_values_to_reduce > 32) || + (RunningOnSycl)) { + data = static_cast(m_device.get( + (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + m_result = data; + } else { + return true; + } + } + Op reducer(m_reducer); + // For SYCL this if always return false + if (internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, + num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } + } +#if defined(EIGEN_USE_SYCL) + // If there is no Optimised version for SYCL, the reduction expression + // must break into two subexpression and use the SYCL generic Reducer on the device. 
+ if (RunningOnSycl) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + data = static_cast( + m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + m_result = data; + } + Op reducer(m_reducer); + internal::GenericReducer::run(*this, reducer, m_device, data, num_values_to_reduce, + num_coeffs_to_preserve); + return (m_result != NULL); + } +#endif + } + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) { done(evalSubExprsIfNeededCommon(data)); }); + } +#endif + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + return evalSubExprsIfNeededCommon(data); + } + + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + if (m_result) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + if ((RunningFullReduction || RunningOnGPU) && m_result) { + return *(m_result + index); + } + Op reducer(m_reducer); + if (ReducingInnerMostDims || RunningFullReduction) { + const Index num_values_to_reduce = (static_cast(Layout) == static_cast(ColMajor)) + ? m_preservedStrides[0] + : m_preservedStrides[NumPreservedStrides - 1]; + return internal::InnerMostDimReducer::reduce(*this, firstInput(index), num_values_to_reduce, reducer); + } else { + typename Self::CoeffReturnType accum = reducer.initialize(); + internal::GenericDimReducer::reduce(*this, firstInput(index), reducer, &accum); + return reducer.finalize(accum); + } + } + + // TODO(bsteiner): provide a more efficient implementation. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions()))); + + if (RunningOnGPU && m_result) { + return internal::pload(m_result + index); + } + + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + if (ReducingInnerMostDims) { + const Index num_values_to_reduce = (static_cast(Layout) == static_cast(ColMajor)) + ? m_preservedStrides[0] + : m_preservedStrides[NumPreservedStrides - 1]; + const Index firstIndex = firstInput(index); + for (Index i = 0; i < PacketSize; ++i) { + Op reducer(m_reducer); + values[i] = internal::InnerMostDimReducer::reduce(*this, firstIndex + i * num_values_to_reduce, + num_values_to_reduce, reducer); + } + } else if (PreservingInnerMostDims) { + const Index firstIndex = firstInput(index); + const int innermost_dim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : NumOutputDims - 1; + // TBD: extend this the the n innermost dimensions that we preserve. 
+ if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) { + Op reducer(m_reducer); + typename Self::PacketReturnType accum = reducer.template initializePacket(); + internal::InnerMostDimPreserver::reduce(*this, firstIndex, reducer, &accum); + return reducer.finalizePacket(accum); + } else { + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + } + } else { + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + // Must be called after evalSubExprsIfNeeded(). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + if (RunningFullReduction && m_result) { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } else { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const double compute_cost = num_values_to_reduce * internal::functor_traits::Cost; + return m_impl.costPerCoeff(vectorized) * num_values_to_reduce + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC const Device& device() const { return m_device; } + + private: + template + friend struct internal::GenericDimReducer; + template + friend struct internal::InnerMostDimReducer; + template + friend struct internal::InnerMostDimPreserver; + template + friend struct internal::FullReducer; +#ifdef EIGEN_USE_THREADS + template + friend struct internal::FullReducerShard; +#endif +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + template + KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); +#if defined(EIGEN_HAS_GPU_FP16) + template + KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, + internal::packet_traits::type*); + template + KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, + internal::packet_traits::type*); + template + KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); +#endif + template + KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); + + template + KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); +#endif + +#if defined(EIGEN_USE_SYCL) + template + friend class TensorSycl::internal::GenericNondeterministicReducer; + // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer + template + friend struct internal::GenericReducer; +#endif + + template + friend struct internal::InnerReducer; + + struct BlockIteratorState { + Index input_dim; + Index output_size; + Index output_count; + }; + + // Returns the Index in the input tensor of the first value that needs to be + // used to compute the reduction at output index "index". + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + if (ReducingInnerMostDims) { + if (static_cast(Layout) == static_cast(ColMajor)) { + return index * m_preservedStrides[0]; + } else { + return index * m_preservedStrides[NumPreservedStrides - 1]; + } + } + // TBD: optimize the case where we preserve the innermost dimensions. 
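+    // Worked example with illustrative numbers: ColMajor, two output dims,
+    // m_outputStrides = {1, 4}, m_preservedStrides = {2, 8}. Output index 6
+    // splits as idx = 6 / 4 = 1 with remainder 2, so the loop and the final
+    // step give startInput = 1 * 8 + 2 * 2 = 12.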
+ Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (PreservingInnerMostDims) { + eigen_assert(m_preservedStrides[0] == 1); + startInput += index; + } else { + startInput += index * m_preservedStrides[0]; + } + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (PreservingInnerMostDims) { + eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1); + startInput += index; + } else { + startInput += index * m_preservedStrides[NumPreservedStrides - 1]; + } + } + return startInput; + } + + // Bitmap indicating if an input dimension is reduced or not. + array m_reduced; + // Dimensions of the output of the operation. + Dimensions m_dimensions; + // Precomputed strides for the output tensor. + // Avoid zero-sized arrays, since element access fails to compile on GPU. + array m_outputStrides; + array, (std::max)(NumOutputDims, 1)> m_fastOutputStrides; + array m_preservedStrides; + // Map from output to input dimension index. + array m_output_to_input_dim_map; + // How many values go into each reduction + Index m_numValuesToReduce; + + // Subset of strides of the input tensor for the reduced dimensions. + // Indexed by reduced dimensions. + array m_reducedStrides; + // Size of the input dimensions that are reduced. + // Indexed by reduced dimensions. + array m_reducedDims; + + // Evaluator for the input expression. + TensorEvaluator m_impl; + + // Operation to apply for computing the reduction. 
+  Op m_reducer;
+
+  EvaluatorPointerType m_result;
+
+  const Device EIGEN_DEVICE_REF m_device;
+};
+
+template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
+    : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
+  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device) {}
+};
+
+template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
+    : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
+      Base;
+  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device)
+      : Base(op, device) {}
+  // The coeff function in the base class uses a recursive method, which is not standard layout
+  // and cannot be used in a SYCL kernel; therefore the coeff function is overridden for the SYCL kernel.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
+    return *(this->data() + index);
+  }
+  // The packet function in the base class uses the same recursive method and likewise cannot be used
+  // in a SYCL kernel; therefore the packet function is overridden for the SYCL kernel.
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
+    return internal::pload<typename Base::PacketReturnType>(this->data() + index);
+  }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
new file mode 100644
index 00000000000..c5273e9b82d
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -0,0 +1,958 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+// Full reducers for GPU, don't vectorize for now
+
+// Reducer function that enables multiple GPU threads to safely accumulate at the same
+// output address. It basically reads the current value of the output variable, and
+// attempts to update it with the new value. If in the meantime another GPU thread
+// updated the content of the output address, it will try again.
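+// For intuition, a minimal standalone sketch of the same compare-and-swap loop,
+// assuming a 32-bit float sum reduction (the helper name is illustrative, not
+// upstream API):
+//
+//   __device__ void atomic_add_via_cas(float* output, float val) {
+//     unsigned int old_bits = __float_as_uint(*output);
+//     for (;;) {
+//       const unsigned int new_bits = __float_as_uint(__uint_as_float(old_bits) + val);
+//       const unsigned int seen = atomicCAS(reinterpret_cast<unsigned int*>(output),
+//                                           old_bits, new_bits);
+//       if (seen == old_bits) return;  // our update landed
+//       old_bits = seen;               // another thread won the race: retry
+//     }
+//   }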
+template +__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) + if (sizeof(T) == 4) { + unsigned int oldval = *reinterpret_cast(output); + unsigned int newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } else if (sizeof(T) == 8) { + unsigned long long oldval = *reinterpret_cast(output); + unsigned long long newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned long long readback; + while ((readback = atomicCAS(reinterpret_cast(output), oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } else { + gpu_assert(0 && "Wordsize not supported"); + } +#else // EIGEN_CUDA_ARCH >= 300 + EIGEN_UNUSED_VARIABLE(output); + EIGEN_UNUSED_VARIABLE(accum); + EIGEN_UNUSED_VARIABLE(reducer); + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +// We extend atomicExch to support extra data types +template +__device__ inline Type atomicExchCustom(Type* address, Type val) { + return atomicExch(address, val); +} + +template <> +__device__ inline double atomicExchCustom(double* address, double val) { + unsigned long long int* address_as_ull = reinterpret_cast(address); + return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); +} + +#ifdef EIGEN_HAS_GPU_FP16 +template +__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) { + unsigned int oldval = *reinterpret_cast(output); + unsigned int newval = oldval; + reducer.reducePacket(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reducePacket(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } +} +#ifdef EIGEN_GPU_COMPILE_PHASE +// reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations +template +__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) { + half2* houtput = reinterpret_cast(output); + half2* haccum = reinterpret_cast(&accum); + for (int i = 0; i < 4; ++i) { + atomicReduce(houtput + i, *(haccum + i), reducer); + } +} +#endif // EIGEN_GPU_COMPILE_PHASE +#endif // EIGEN_HAS_GPU_FP16 + +template <> +__device__ inline void atomicReduce(float* output, float accum, SumReducer&) { +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) + atomicAdd(output, accum); +#else // EIGEN_CUDA_ARCH >= 300 + EIGEN_UNUSED_VARIABLE(output); + EIGEN_UNUSED_VARIABLE(accum); + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, + CoeffType* output) { + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const Index num_threads = 
blockDim.x * gridDim.x;
+  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+    output[i] = val;
+  }
+}
+
+template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+                                                                 typename Self::CoeffReturnType* output,
+                                                                 unsigned int* semaphore) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  // Initialize the output value
+  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      *output = reducer.initialize();
+    }
+  } else {
+    if (threadIdx.x == 0) {
+      unsigned int block = atomicCAS(semaphore, 0u, 1u);
+      if (block == 0) {
+        // We're the first block to run, initialize the output value
+        atomicExchCustom(output, reducer.initialize());
+        __threadfence();
+        atomicExch(semaphore, 2u);
+      } else {
+        // Wait for the first block to initialize the output value.
+        // Use atomicCAS here to ensure that the reads aren't cached
+        unsigned int val;
+        do {
+          val = atomicCAS(semaphore, 2u, 2u);
+        } while (val < 2u);
+      }
+    }
+  }
+
+  __syncthreads();
+
+  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
+
+  typename Self::CoeffReturnType accum = reducer.initialize();
+  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread * BlockSize);
+  for (Index i = 0; i < max_iter; i += BlockSize) {
+    const Index index = first_index + i;
+    eigen_assert(index < num_coeffs);
+    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
+    reducer.reduce(val, &accum);
+  }
+
+#pragma unroll
+  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+#if defined(EIGEN_HIPCC)
+    // Use std::is_floating_point to determine the type of the accumulator.
+    // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
+    // and list the float and int versions of __shfl_down as the candidate functions.
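Stepping back from the HIP-specific workaround for a moment: the #pragma unroll loop opened above is a warp-level butterfly reduction. Here is a host-side sketch of what it computes, assuming a 32-lane warp; the lanes array stands in for the per-lane registers that __shfl_down/__shfl_down_sync exchange.

#include <cstdio>

// Host-side sketch of the warp shuffle butterfly: each round, lane i adds the
// value held by lane i + offset, halving `offset` every time, so lane 0 ends
// up with the total for the whole warp after log2(32) = 5 rounds.
int main() {
  const int warp_size = 32;
  float lanes[warp_size];
  for (int i = 0; i < warp_size; ++i) lanes[i] = 1.0f;  // each lane contributes 1
  for (int offset = warp_size / 2; offset > 0; offset /= 2) {
    for (int i = 0; i < warp_size; ++i) {
      // __shfl_down(_sync) returns lanes[i + offset]; lanes past the end are
      // irrelevant because only lane 0's result is consumed afterwards.
      if (i + offset < warp_size) lanes[i] += lanes[i + offset];
    }
  }
  std::printf("warp total = %f\n", lanes[0]);  // 32.000000
  return 0;
}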
+ if (std::is_floating_point::value) { + reducer.reduce(__shfl_down(static_cast(accum), offset, warpSize), &accum); + } else { + reducer.reduce(__shfl_down(static_cast(accum), offset, warpSize), &accum); + } +#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + reducer.reduce(__shfl_down(accum, offset, warpSize), &accum); +#else + reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum); +#endif + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(output, accum, reducer); + } + + if (gridDim.x > 1 && threadIdx.x == 0) { + // Let the last block reset the semaphore + atomicInc(semaphore, gridDim.x + 1); +#if defined(EIGEN_HIPCC) + __threadfence_system(); +#endif + } +#else // EIGEN_CUDA_ARCH >= 300 + EIGEN_UNUSED_VARIABLE(reducer); + EIGEN_UNUSED_VARIABLE(input); + EIGEN_UNUSED_VARIABLE(num_coeffs); + EIGEN_UNUSED_VARIABLE(output); + EIGEN_UNUSED_VARIABLE(semaphore); + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +#ifdef EIGEN_HAS_GPU_FP16 +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, + Index num_coeffs, half* scratch) { + eigen_assert(blockDim.x == 1); + eigen_assert(gridDim.x == 1); + typedef packet_traits::type packet_type; + Index packet_remainder = num_coeffs % Index(unpacket_traits::size); + if (packet_remainder != 0) { + half2* h2scratch = reinterpret_cast(scratch); + for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) { + *h2scratch = __halves2half2(input.coeff(i), input.coeff(i + 1)); + h2scratch++; + } + if ((num_coeffs & 1) != 0) { + half lastCoeff = input.coeff(num_coeffs - 1); + *h2scratch = __halves2half2(lastCoeff, reducer.initialize()); + } + } else { + packet_type reduce = reducer.template initializePacket(); + internal::pstoreu(scratch, reduce); + } +} + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self /*input*/, + Index num_coeffs, half* output) { + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const Index num_threads = blockDim.x * gridDim.x; + typedef typename packet_traits::type PacketType; + + const Index num_packets = num_coeffs / Index(unpacket_traits::size); + PacketType* p_output = reinterpret_cast(output); + for (Index i = thread_id; i < num_packets; i += num_threads) { + p_output[i] = reducer.template initializePacket(); + } + Index packet_remainder = num_coeffs % Index(unpacket_traits::size); + if (thread_id < packet_remainder) { + output[num_coeffs - packet_remainder + thread_id] = reducer.initialize(); + } +} + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, + Index num_coeffs, half* output, + half* scratch) { + typedef typename packet_traits::type PacketType; + const int packet_width = unpacket_traits::size; + eigen_assert(NumPerThread % packet_width == 0); + const Index first_index = blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x; + + // Initialize the output value if it wasn't initialized by the ReductionInitKernel + + if (gridDim.x == 1) { + if (first_index == 0) { + int rem = num_coeffs % packet_width; + if (rem != 0) { + half2* p_scratch = reinterpret_cast(scratch); + pstoreu(scratch, reducer.template initializePacket()); + for (int i = 0; i < rem / 2; i++) { + *p_scratch = __halves2half2(input.coeff(num_coeffs - packet_width + 2 * i), + input.coeff(num_coeffs - packet_width + 
2 * i + 1)); + p_scratch++; + } + if ((num_coeffs & 1) != 0) { + half last = input.coeff(num_coeffs - 1); + *p_scratch = __halves2half2(last, reducer.initialize()); + } + } else { + PacketType reduce = reducer.template initializePacket(); + pstoreu(scratch, reduce); + } + } + __syncthreads(); + } + + PacketType accum = reducer.template initializePacket(); + const Index max_iter = + numext::mini((num_coeffs - first_index) / packet_width, NumPerThread * BlockSize / packet_width); + for (Index i = 0; i < max_iter; i += BlockSize) { + const Index index = first_index + packet_width * i; + eigen_assert(index + packet_width < num_coeffs); + PacketType val = input.template packet(index); + reducer.reducePacket(val, &accum); + } + +#pragma unroll + for (int offset = warpSize / 2; offset > 0; offset /= 2) { +#if defined(EIGEN_HIPCC) + PacketType r1; + half2* hr = reinterpret_cast(&r1); + half2* hacc = reinterpret_cast(&accum); + for (int i = 0; i < packet_width / 2; i++) { + // FIXME : remove this workaround once we have native half/half2 support for __shfl_down + union { + int i; + half2 h; + } wka_in, wka_out; + wka_in.h = hacc[i]; + wka_out.i = __shfl_down(wka_in.i, offset, warpSize); + hr[i] = wka_out.h; + } + reducer.reducePacket(r1, &accum); +#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + PacketType r1; + half2* hr = reinterpret_cast(&r1); + half2* hacc = reinterpret_cast(&accum); + for (int i = 0; i < packet_width / 2; i++) { + hr[i] = __shfl_down(hacc[i], offset, warpSize); + } + reducer.reducePacket(r1, &accum); +#else + PacketType r1; + half2* hr = reinterpret_cast(&r1); + half2* hacc = reinterpret_cast(&accum); + for (int i = 0; i < packet_width / 2; i++) { + hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize); + } + reducer.reducePacket(r1, &accum); + +#endif + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(reinterpret_cast(scratch), accum, reducer); + } + + __syncthreads(); + half2* rv1 = reinterpret_cast(scratch); + if (packet_width > 2) { + reducer.reducePacket(rv1[2], rv1); + reducer.reducePacket(rv1[3], rv1 + 1); + reducer.reducePacket(rv1[1], rv1); + } + if (gridDim.x == 1) { + if (first_index == 0) { + half tmp = __low2half(*rv1); + reducer.reduce(__high2half(*rv1), &tmp); + *output = tmp; + } + } +} + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) { + eigen_assert(threadIdx.x == 1); + typedef packet_traits::type packet_type; + if (unpacket_traits::size == 1) { + *output = *scratch; + } else { + half2* pscratch = reinterpret_cast(scratch); + half tmp = __float2half(0.f); + for (int i = 0; i < unpacket_traits::size; i += 2) { + reducer.reduce(__low2half(*pscratch), &tmp); + reducer.reduce(__high2half(*pscratch), &tmp); + pscratch++; + } + *output = tmp; + } +} + +#endif // EIGEN_HAS_GPU_FP16 + +template +struct FullReductionLauncher { + static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) { + gpu_assert(false && "Should only be called on doubles, floats and half floats"); + } +}; + +// Specialization for float and double +template +struct FullReductionLauncher< + Self, Op, OutputType, PacketAccess, + std::enable_if_t::value || internal::is_same::value, + void>> { + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, + typename Self::Index num_coeffs) { + typedef typename Self::Index Index; + const int block_size = 256; + const int num_per_thread = 128; + const int 
num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
+
+    unsigned int* semaphore = NULL;
+    if (num_blocks > 1) {
+      semaphore = device.semaphore();
+    }
+
+    LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), num_blocks, block_size, 0,
+                      device, reducer, self, num_coeffs, output, semaphore);
+  }
+};
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {
+  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+    gpu_assert(false && "Should not be called since there is no packet accessor");
+  }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
+                  typename Self::Index num_coeffs) {
+    typedef typename Self::Index Index;
+
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
+    half* scratch = static_cast<half*>(device.scratchpad());
+
+    if (num_blocks > 1) {
+      // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
+      // won't be race conditions between multiple thread blocks.
+      LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>), 1, 1, 0, device, reducer, self,
+                        num_coeffs, scratch);
+    }
+
+    LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>), num_blocks,
+                      block_size, 0, device, reducer, self, num_coeffs, output, scratch);
+
+    if (num_blocks > 1) {
+      LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>), 1, 1, 0, device, reducer, output, scratch);
+    }
+  }
+};
+#endif  // EIGEN_HAS_GPU_FP16
+
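The launch configuration above follows directly from the work decomposition: each block has block_size threads and each thread reduces num_per_thread coefficients, so the grid needs ceil(num_coeffs / (block_size * num_per_thread)) blocks. A small self-contained check of that arithmetic (the input size here is illustrative, not taken from the patch):

#include <cstdio>

// Integer ceiling division, as numext::div_ceil does in the launchers above.
constexpr long div_ceil(long a, long b) { return (a + b - 1) / b; }

int main() {
  const long num_coeffs = 1L << 20;  // example input size: 1048576 coefficients
  const int block_size = 256;        // threads per block
  const int num_per_thread = 128;    // coefficients each thread reduces
  const long num_blocks = div_ceil(num_coeffs, long(block_size) * num_per_thread);
  // 1048576 / (256 * 128) = 32 blocks exactly; any remainder would add one more
  // block, which is why ceiling division is used instead of plain division.
  std::printf("num_blocks = %ld\n", num_blocks);
  return 0;
}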
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+  // Unfortunately nvidia doesn't handle exotic types such as complex well,
+  // so the optimized version of the code is restricted to the simple cases
+  // of doubles, floats and half floats.
+#ifdef EIGEN_HAS_GPU_FP16
+  static constexpr bool HasOptimizedImplementation =
+      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                           internal::is_same<typename Self::CoeffReturnType, double>::value ||
+                                           (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
+                                            reducer_traits<Op, GpuDevice>::PacketAccess));
+#else   // EIGEN_HAS_GPU_FP16
+  static constexpr bool HasOptimizedImplementation =
+      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif  // EIGEN_HAS_GPU_FP16
+
+  template <typename OutputType>
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return;
+    }
+
+    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer,
+                                                                                                  device, output,
+                                                                                                  num_coeffs);
+  }
+};
+
+template <int NumPerThread, typename Self, typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input,
+                                                                  Index num_coeffs_to_reduce,
+                                                                  Index num_preserved_coeffs,
+                                                                  typename Self::CoeffReturnType* output) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  typedef typename Self::CoeffReturnType Type;
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  const int unroll_times = 16;
+  eigen_assert(NumPerThread % unroll_times == 0);
+
+  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = i / input_col_blocks;
+
+    if (row < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
+
+      Type reduced_val = reducer.initialize();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
+        if (last_col >= num_coeffs_to_reduce) {
+          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
+            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+            reducer.reduce(val, &reduced_val);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k);
+            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+#if defined(EIGEN_HIPCC)
+        // Use std::is_floating_point to determine the type of reduced_val.
+        // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
+        // and list the float and int versions of __shfl_down as the candidate functions.
+ if (std::is_floating_point::value) { + reducer.reduce(__shfl_down(static_cast(reduced_val), offset), &reduced_val); + } else { + reducer.reduce(__shfl_down(static_cast(reduced_val), offset), &reduced_val); + } +#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); +#else + reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val); +#endif + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(&(output[row]), reduced_val, reducer); + } + } + } +#else // EIGEN_CUDA_ARCH >= 300 + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +#ifdef EIGEN_HAS_GPU_FP16 + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, + Index num_coeffs_to_reduce, + Index num_preserved_coeffs, half* output) { + eigen_assert(blockDim.y == 1); + eigen_assert(blockDim.z == 1); + eigen_assert(gridDim.y == 1); + eigen_assert(gridDim.z == 1); + + typedef typename packet_traits::type PacketType; + const int packet_width = unpacket_traits::size; + const int unroll_times = 16 / packet_width; + eigen_assert(NumPerThread % unroll_times == 0); + eigen_assert(unroll_times % 2 == 0); + + const Index input_col_blocks = numext::div_ceil(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2); + const Index num_input_blocks = numext::div_ceil(input_col_blocks * num_preserved_coeffs, 2); + + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + Index i = packet_width * thread_id; + for (; i + packet_width <= num_preserved_coeffs; i += packet_width * num_threads) { + PacketType* poutput = reinterpret_cast(output + i); + *poutput = reducer.template initializePacket(); + } + if (i < num_preserved_coeffs) { + output[i] = reducer.initialize(); + } + __syncthreads(); + } + + for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { + const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows + + if (row + 1 < num_preserved_coeffs) { + const Index col_block = i % input_col_blocks; + const Index col_begin = packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x); + + PacketType reduced_val1 = reducer.template initializePacket(); + PacketType reduced_val2 = reducer.template initializePacket(); + + for (Index j = 0; j < NumPerThread; j += unroll_times) { + const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * packet_width; + if (last_col >= num_coeffs_to_reduce) { + Index col = col_begin + blockDim.x * j; + for (; col + packet_width <= num_coeffs_to_reduce; col += blockDim.x) { + const PacketType val1 = input.m_impl.template packet(row * num_coeffs_to_reduce + col); + reducer.reducePacket(val1, &reduced_val1); + const PacketType val2 = input.m_impl.template packet((row + 1) * num_coeffs_to_reduce + col); + reducer.reducePacket(val2, &reduced_val2); + } + if (col < num_coeffs_to_reduce) { + PacketType r1 = reducer.template initializePacket(); + PacketType r2 = reducer.template initializePacket(); + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + while (col + 1 < num_coeffs_to_reduce) { + *hr1 = __halves2half2(input.m_impl.coeff(row * num_coeffs_to_reduce + col), + input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1)); + *hr2 = __halves2half2(input.m_impl.coeff((row 
+ 1) * num_coeffs_to_reduce + col), + input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + 1)); + hr1++; + hr2++; + col += 2; + } + if (col < num_coeffs_to_reduce) { + // Peel; + const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + *hr1 = __halves2half2(last1, reducer.initialize()); + const half last2 = input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col); + *hr2 = __halves2half2(last2, reducer.initialize()); + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); + } + break; + } else { + // Faster version of the loop with no branches after unrolling. +#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + blockDim.x * (j + k) * packet_width; + reducer.reducePacket(input.m_impl.template packet(row * num_coeffs_to_reduce + col), + &reduced_val1); + reducer.reducePacket(input.m_impl.template packet((row + 1) * num_coeffs_to_reduce + col), + &reduced_val2); + } + } + } + +#pragma unroll + for (int offset = warpSize / 2; offset > 0; offset /= 2) { +#if defined(EIGEN_HIPCC) + PacketType r1; + PacketType r2; + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + half2* rv1 = reinterpret_cast(&reduced_val1); + half2* rv2 = reinterpret_cast(&reduced_val2); + for (int i = 0; i < packet_width / 2; i++) { + // FIXME : remove this workaround once we have native half/half2 support for __shfl_down + union { + int i; + half2 h; + } wka_in1, wka_out1; + wka_in1.h = rv1[i]; + wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize); + hr1[i] = wka_out1.h; + + union { + int i; + half2 h; + } wka_in2, wka_out2; + wka_in2.h = rv2[i]; + wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize); + hr2[i] = wka_out2.h; + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); +#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + PacketType r1; + PacketType r2; + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + half2* rv1 = reinterpret_cast(&reduced_val1); + half2* rv2 = reinterpret_cast(&reduced_val2); + for (int i = 0; i < packet_width / 2; i++) { + hr1[i] = __shfl_down(rv1[i], offset, warpSize); + hr2[i] = __shfl_down(rv2[i], offset, warpSize); + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); +#else + PacketType r1; + PacketType r2; + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + half2* rr1 = reinterpret_cast(&reduced_val1); + half2* rr2 = reinterpret_cast(&reduced_val2); + for (int j = 0; j < packet_width / 2; j++) { + hr1[j] = __shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize); + hr2[j] = __shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize); + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); + +#endif + } + half2* rv1 = reinterpret_cast(&reduced_val1); + half2* rv2 = reinterpret_cast(&reduced_val2); + half2 val; + if (packet_width > 2) { + reducer.reducePacket(rv1[2], rv1); + reducer.reducePacket(rv1[3], rv1 + 1); + reducer.reducePacket(rv1[1], rv1); + reducer.reducePacket(rv2[2], rv2); + reducer.reducePacket(rv2[3], rv2 + 1); + reducer.reducePacket(rv2[1], rv2); + } + half val1 = __low2half(*rv1); + reducer.reduce(__high2half(*rv1), &val1); + half val2 = __low2half(*rv2); + reducer.reduce(__high2half(*rv2), &val2); + val = __halves2half2(val1, val2); + if ((threadIdx.x & (warpSize - 1)) == 0) { + half* loc = output + row; + atomicReduce(reinterpret_cast(loc), val, reducer); + } + } + 
} +} + +#endif // EIGEN_HAS_GPU_FP16 + +template +struct InnerReductionLauncher { + static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, + typename Self::Index) { + gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device"); + return true; + } +}; + +// Specialization for float and double +template +struct InnerReductionLauncher< + Self, Op, OutputType, PacketAccess, + std::enable_if_t::value || internal::is_same::value, + void>> { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, + typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = 256; + const int num_per_thread = 128; + const int dyn_blocks = numext::div_ceil(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be a race conditions between multiple thread blocks. + const int dyn_blocks2 = numext::div_ceil(num_preserved_vals, 1024); + const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024; + const int num_blocks2 = numext::mini(max_blocks2, dyn_blocks2); + LAUNCH_GPU_KERNEL((ReductionInitKernel), num_blocks2, 1024, 0, device, reducer.initialize(), + num_preserved_vals, output); + } + + LAUNCH_GPU_KERNEL((InnerReductionKernel), num_blocks, block_size, 0, device, + reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; + +#ifdef EIGEN_HAS_GPU_FP16 +template +struct InnerReductionLauncher { + static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) { + gpu_assert(false && "Should not be called since there is no packet accessor"); + return true; + } +}; + +template +struct InnerReductionLauncher { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, + typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + if (num_preserved_vals % 2 != 0) { + // Not supported yet, revert to the slower code path + return true; + } + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = /*256*/ 128; + const int num_per_thread = /*128*/ 64; + const int dyn_blocks = numext::div_ceil(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be a race conditions between multiple thread blocks. 
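The inner-reduction launcher above clamps the dynamically required grid by what the device can keep resident, and a separate init kernel is launched only in the multi-block case, where output races between blocks are possible. A sketch of the clamping arithmetic, with made-up device numbers (not values queried from any real GPU):

#include <algorithm>
#include <cstdio>

int main() {
  const long num_coeffs = 4096L * 4096L;  // total elements = reduce * preserved
  const int block_size = 256, num_per_thread = 128;
  const int multiprocessors = 80;         // illustrative: an 80-SM device
  const int max_threads_per_sm = 2048;    // illustrative resident-thread limit
  // Blocks the workload would like, vs. blocks the device can keep resident.
  const long dyn_blocks = (num_coeffs + long(block_size) * num_per_thread - 1) /
                          (long(block_size) * num_per_thread);
  const long max_blocks = long(multiprocessors) * max_threads_per_sm / block_size;
  const long num_blocks = std::min(max_blocks, dyn_blocks);
  std::printf("dyn=%ld max=%ld -> launch %ld blocks\n", dyn_blocks, max_blocks, num_blocks);
  return 0;  // prints dyn=512 max=640 -> launch 512 blocks
}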
+ LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat), 1, 1, 0, device, reducer, self, + num_preserved_vals, output); + } + + LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat), num_blocks, block_size, 0, + device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; +#endif // EIGEN_HAS_GPU_FP16 + +template +struct InnerReducer { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats and half floats. +#ifdef EIGEN_HAS_GPU_FP16 + static constexpr bool HasOptimizedImplementation = + !Self::ReducerTraits::IsStateful && (internal::is_same::value || + internal::is_same::value || + (internal::is_same::value && + reducer_traits::PacketAccess)); +#else // EIGEN_HAS_GPU_FP16 + static constexpr bool HasOptimizedImplementation = + !Self::ReducerTraits::IsStateful && (internal::is_same::value || + internal::is_same::value); +#endif // EIGEN_HAS_GPU_FP16 + + template + static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, + typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. + if (num_coeffs == 0) { + return true; + } + // It's faster to use the usual code. + if (num_coeffs_to_reduce <= 128) { + return true; + } + + return InnerReductionLauncher::PacketAccess>::run( + self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); + } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, + Index num_coeffs_to_reduce, + Index num_preserved_coeffs, + typename Self::CoeffReturnType* output) { + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } + __syncthreads(); + } + + // Do the reduction. + const Index max_iter = num_preserved_coeffs * numext::div_ceil(num_coeffs_to_reduce, NumPerThread); + for (Index i = thread_id; i < max_iter; i += num_threads) { + const Index input_col = i % num_preserved_coeffs; + const Index input_row = (i / num_preserved_coeffs) * NumPerThread; + typename Self::CoeffReturnType reduced_val = reducer.initialize(); + const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce); + for (Index j = input_row; j < max_row; j++) { + typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col); + reducer.reduce(val, &reduced_val); + } + atomicReduce(&(output[input_col]), reduced_val, reducer); + } +} + +template +struct OuterReducer { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats. 
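For orientation, the outer-reduction kernel above reduces the slowest-varying dimension, so each preserved output element gathers inputs with stride num_preserved_coeffs from the flattened tensor. A minimal host-side sketch of that access pattern for a sum reducer:

#include <cstdio>

int main() {
  const int num_reduce = 3, num_preserved = 4;
  float input[num_reduce * num_preserved];
  for (int i = 0; i < num_reduce * num_preserved; ++i) input[i] = float(i);
  float output[num_preserved];
  for (int col = 0; col < num_preserved; ++col) {
    float acc = 0.0f;  // reducer.initialize() for a sum reducer
    for (int row = 0; row < num_reduce; ++row) {
      // Strided gather, matching `input.m_impl.coeff(j * num_preserved_coeffs + input_col)`
      acc += input[row * num_preserved + col];
    }
    output[col] = acc;
  }
  for (int col = 0; col < num_preserved; ++col) std::printf("%g ", output[col]);
  std::printf("\n");  // prints: 12 15 18 21
  return 0;
}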
+ static constexpr bool HasOptimizedImplementation = + !Self::ReducerTraits::IsStateful && (internal::is_same::value || + internal::is_same::value); + template + static +#if !defined(EIGEN_HIPCC) + // FIXME : leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error + // (in the cxx11_tensor_reduction_gpu test) + // + // terminate called after throwing an instance of 'std::runtime_error' + // what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL... + // + // don't know why this happens (and why is it a runtime error instead of a compile time error) + // + // this will be fixed by HIP PR#457 + EIGEN_DEVICE_FUNC +#endif + bool + run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device"); + return true; + } + + static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, + typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + // It's faster to use the usual code. + if (num_coeffs_to_reduce <= 32) { + return true; + } + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = 256; + const int num_per_thread = 16; + const int dyn_blocks = numext::div_ceil(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs in the reduction kernel itself when we don't have to worry + // about race conditions between multiple thread blocks. + const int dyn_blocks2 = numext::div_ceil(num_preserved_vals, 1024); + const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024; + const int num_blocks2 = numext::mini(max_blocks2, dyn_blocks2); + LAUNCH_GPU_KERNEL((ReductionInitKernel), num_blocks2, 1024, 0, device, reducer.initialize(), + num_preserved_vals, output); + } + + LAUNCH_GPU_KERNEL((OuterReductionKernel), num_blocks, block_size, 0, device, + reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; + +#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h new file mode 100644 index 00000000000..6944c033708 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -0,0 +1,588 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorReductionSycl.h + * + * \brief: + * This is the specialization of the reduction operation. 
A two-phase reduction approach
 * is used because the GPU has no global synchronization of global memory across
 * different work-groups/thread blocks. Two kernels are therefore needed to reduce the data:
 * the first kernel reduces the data locally, and each work-group/thread block saves its
 * partial result to global memory. In the second phase (global reduction) a single
 * work-group/thread block reduces the intermediate data to one element.
 * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU:
 * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
 *
 *****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
+struct OpDefiner {
+  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
+  typedef Op type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+                                                                            const Index &) {
+    return accumulator;
+  }
+};
+
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
+  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+    return type();
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
+                                                                           const Index &scale) {
+    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
+    return quotient_op(accumulator, CoeffReturnType(scale));
+  }
+};
+
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
+  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
+  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+    return type();
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+                                                                            const Index &scale) {
+    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
+  }
+};
+
+template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
+          Index local_range>
+struct SecondStepFullReducer {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
+  typedef typename OpDef::type Op;
+  LocalAccessor scratch;
+  InputAccessor aI;
+  OutputAccessor outAcc;
+  Op op;
+  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
+      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
+
+  void operator()(cl::sycl::nd_item<1> itemID) const {
+    // Our empirical research shows that the best performance is achieved when there is only
+    // one element per thread to reduce in the second step; the second-step reduction time is
+    // then almost negligible. Hence, in the second step of the reduction the input size is
+    // fixed to the local size, so only one element is read per thread. The algorithm must be
+    // changed if the number of elements reduced per thread in the second step is greater
+    // than 1; otherwise, the result will be wrong.
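The loop that follows is a local-memory tree reduction: with a power-of-two group size, each round folds the upper half of the scratch buffer onto the lower half, so scratch[0] holds the group result after log2(N) rounds. A host-side sketch of the idea, with barriers elided since a single thread plays all work-items here:

#include <cstdio>

int main() {
  const int local_size = 8;  // must be a power of two, as these kernels assert
  float scratch[local_size] = {3, 1, 4, 1, 5, 9, 2, 6};
  for (int offset = local_size / 2; offset > 0; offset /= 2) {
    // (a work-group barrier would separate the rounds on a real device)
    for (int lid = 0; lid < offset; ++lid) {
      scratch[lid] += scratch[lid + offset];  // fold upper half onto lower half
    }
  }
  std::printf("group sum = %g\n", scratch[0]);  // prints: group sum = 31
  return 0;
}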
+ const Index localid = itemID.get_local_id(0); + auto aInPtr = aI + localid; + auto aOutPtr = outAcc; + CoeffReturnType *scratchptr = scratch.get_pointer(); + CoeffReturnType accumulator = *aInPtr; + + scratchptr[localid] = op.finalize(accumulator); + for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.reduce(scratchptr[localid + offset], &accumulator); + scratchptr[localid] = op.finalize(accumulator); + } + } + if (localid == 0) *aOutPtr = op.finalize(accumulator); + } +}; + +// Full reduction first phase. In this version the vectorization is true and the reduction accept +// any generic reducerOp e.g( max, min, sum, mean, iamax, iamin, etc ). +template +class FullReductionKernelFunctor { + public: + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::Index Index; + typedef OpDefiner + OpDef; + + typedef typename OpDef::type Op; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::PacketReturnType PacketReturnType; + typedef std::conditional_t<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess), PacketReturnType, + CoeffReturnType> + OutType; + typedef cl::sycl::accessor + LocalAccessor; + LocalAccessor scratch; + Evaluator evaluator; + EvaluatorPointerType final_output; + Index rng; + Op op; + + FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_, + Index rng_, OpType op_) + : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {} + + void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t compute_reduction( + const cl::sycl::nd_item<1> &itemID) const { + auto output_ptr = final_output; + Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize; + Index globalid = itemID.get_global_id(0); + Index localid = itemID.get_local_id(0); + Index step = Evaluator::PacketSize * itemID.get_global_range(0); + Index start = Evaluator::PacketSize * globalid; + // vectorizable parts + PacketReturnType packetAccumulator = op.template initializePacket(); + for (Index i = start; i < VectorizedRange; i += step) { + op.template reducePacket(evaluator.impl().template packet(i), &packetAccumulator); + } + globalid += VectorizedRange; + // non vectorizable parts + for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) { + op.template reducePacket( + ::Eigen::TensorSycl::internal::PacketWrapper::convert_to_packet_type( + evaluator.impl().coeff(i), op.initialize()), + &packetAccumulator); + } + scratch[localid] = packetAccumulator = + OpDef::finalise_op(op.template finalizePacket(packetAccumulator), rng); + // reduction parts // Local size is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = local_range / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.template reducePacket(scratch[localid + offset], &packetAccumulator); + scratch[localid] = op.template finalizePacket(packetAccumulator); + } + } + if (localid == 0) { + output_ptr[itemID.get_group(0)] = + op.finalizeBoth(op.initialize(), op.template finalizePacket(packetAccumulator)); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t compute_reduction( + const cl::sycl::nd_item<1> &itemID) 
const { + auto output_ptr = final_output; + Index globalid = itemID.get_global_id(0); + Index localid = itemID.get_local_id(0); + // vectorizable parts + CoeffReturnType accumulator = op.initialize(); + // non vectorizable parts + for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) { + op.reduce(evaluator.impl().coeff(i), &accumulator); + } + scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng); + + // reduction parts. the local size is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = local_range / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.reduce(scratch[localid + offset], &accumulator); + scratch[localid] = op.finalize(accumulator); + } + } + if (localid == 0) { + output_ptr[itemID.get_group(0)] = op.finalize(accumulator); + } + } +}; + +template +class GenericNondeterministicReducer { + public: + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::Index Index; + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + template + GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_, + Index range_, Index num_values_to_reduce_) + : evaluator(evaluator_), + output_accessor(output_accessor_), + functor(OpDef::get_op(functor_)), + range(range_), + num_values_to_reduce(num_values_to_reduce_) {} + + void operator()(cl::sycl::nd_item<1> itemID) const { + // This is to bypass the statefull condition in Eigen meanReducer + Op non_const_functor; + std::memcpy(&non_const_functor, &functor, sizeof(Op)); + auto output_accessor_ptr = output_accessor; + Index globalid = static_cast(itemID.get_global_linear_id()); + if (globalid < range) { + CoeffReturnType accum = functor.initialize(); + Eigen::internal::GenericDimReducer::reduce( + evaluator, evaluator.firstInput(globalid), non_const_functor, &accum); + output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce); + } + } + + private: + Evaluator evaluator; + EvaluatorPointerType output_accessor; + Op functor; + Index range; + Index num_values_to_reduce; +}; + +enum class reduction_dim { inner_most, outer_most }; +// default is preserver +template +struct PartialReductionKernel { + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::Index Index; + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + typedef cl::sycl::accessor + ScratchAcc; + ScratchAcc scratch; + Evaluator evaluator; + EvaluatorPointerType output_accessor; + Op op; + const Index preserve_elements_num_groups; + const Index reduce_elements_num_groups; + const Index num_coeffs_to_preserve; + const Index num_coeffs_to_reduce; + + PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_, + const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_, + const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_) + : scratch(scratch_), + evaluator(evaluator_), + output_accessor(output_accessor_), + op(OpDef::get_op(op_)), + preserve_elements_num_groups(preserve_elements_num_groups_), + reduce_elements_num_groups(reduce_elements_num_groups_), + num_coeffs_to_preserve(num_coeffs_to_preserve_), + 
num_coeffs_to_reduce(num_coeffs_to_reduce_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId, + CoeffReturnType &accumulator) const { + if (globalPId >= num_coeffs_to_preserve) { + return; + } + Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve) + : globalRId + (globalPId * num_coeffs_to_reduce); + Index localOffset = globalRId; + + const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups; + const Index per_thread_global_stride = + rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride; + for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) { + op.reduce(evaluator.impl().coeff(global_offset), &accumulator); + localOffset += per_thread_local_stride; + global_offset += per_thread_global_stride; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { + const Index linearLocalThreadId = itemID.get_local_id(0); + Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP + : linearLocalThreadId / PannelParameters::LocalThreadSizeR; + Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP + : linearLocalThreadId % PannelParameters::LocalThreadSizeR; + const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups + : itemID.get_group(0) / reduce_elements_num_groups; + const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups + : itemID.get_group(0) % reduce_elements_num_groups; + + Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId; + const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId; + CoeffReturnType *scratchPtr = scratch.get_pointer(); + auto outPtr = output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0); + CoeffReturnType accumulator = op.initialize(); + + element_wise_reduce(globalRId, globalPId, accumulator); + + accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce); + scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] = + accumulator; + if (rt == reduction_dim::inner_most) { + pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP; + rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP; + globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId; + } + + /* Apply the reduction operation between the current local + * id and the one on the other half of the vector. 
*/ + auto out_scratch_ptr = + scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC))); + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (rt == reduction_dim::inner_most) { + accumulator = *out_scratch_ptr; + } + // The Local LocalThreadSizeR is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) { + if (rLocalThreadId < offset) { + op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator); + // The result has already been divided for mean reducer in the + // previous reduction so no need to divide furthermore + *out_scratch_ptr = op.finalize(accumulator); + } + /* All threads collectively read from global memory into local. + * The barrier ensures all threads' IO is resolved before + * execution continues (strictly speaking, all threads within + * a single work-group - there is no co-ordination between + * work-groups, only work-items). */ + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + + if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) { + outPtr[globalPId] = op.finalize(accumulator); + } + } +}; + +template +struct SecondStepPartialReduction { + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + typedef cl::sycl::accessor + ScratchAccessor; + InputAccessor input_accessor; + OutputAccessor output_accessor; + Op op; + const Index num_coeffs_to_preserve; + const Index num_coeffs_to_reduce; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_, + OutputAccessor output_accessor_, OpType op_, + const Index num_coeffs_to_preserve_, + const Index num_coeffs_to_reduce_) + : input_accessor(input_accessor_), + output_accessor(output_accessor_), + op(OpDef::get_op(op_)), + num_coeffs_to_preserve(num_coeffs_to_preserve_), + num_coeffs_to_reduce(num_coeffs_to_reduce_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { + const Index globalId = itemID.get_global_id(0); + + if (globalId >= num_coeffs_to_preserve) return; + + auto in_ptr = input_accessor + globalId; + + OutScalar accumulator = op.initialize(); + // num_coeffs_to_reduce is not bigger that 256 + for (Index i = 0; i < num_coeffs_to_reduce; i++) { + op.reduce(*in_ptr, &accumulator); + in_ptr += num_coeffs_to_preserve; + } + output_accessor[globalId] = op.finalize(accumulator); + } +}; // namespace internal + +template +struct ReductionPannel { + static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP; + static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR; + static EIGEN_CONSTEXPR bool BC = BC_; +}; + +template +struct PartialReducerLauncher { + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::Storage Storage; + typedef typename Self::Index Index; + typedef ReductionPannel + PannelParameters; + + typedef PartialReductionKernel SyclReducerKerneType; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output, + Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) { + Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP); + + // getPowerOfTwo makes sure local range is power of 2 and <= + // maxSyclThreadPerBlock this will help us to avoid extra check on the + // kernel + 
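The power-of-two requirement mentioned above is what makes the halving tree reductions in these kernels valid. A small sketch of the constant-time check used by the static_assert below, plus rounding up to a power of two; round_up_pow2 is a hypothetical illustration of the idea behind the device's getPowerOfTwo helper, not the SYCL runtime call itself:

#include <cstdio>

// A power of two has exactly one bit set, so n & (n - 1) clears it to zero.
constexpr bool is_pow2(unsigned n) { return n != 0 && (n & (n - 1)) == 0; }

unsigned round_up_pow2(unsigned n) {
  unsigned p = 1;
  while (p < n) p <<= 1;  // double until we reach or pass n
  return p;
}

int main() {
  static_assert(is_pow2(256), "local range must be a power of two");
  std::printf("round_up_pow2(100) = %u\n", round_up_pow2(100));  // prints 128
  return 0;
}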
static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
+                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
+                  "The Local thread size must be a power of 2 for the reduction "
+                  "operation");
+
+    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
+    // Here we force the code to perform at most a 2-step reduction. Our empirical research
+    // shows that if each thread reduces at least 64 elements individually we get better
+    // performance, although this can change on different platforms. It also shows that for
+    // the inner_most dim reducer it is better to have 8 groups in a reduce dimension for
+    // sizes > 1024 to achieve the best performance.
+    const Index reductionPerThread = 64;
+    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
+    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
+    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
+    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
+    const Index globalRange = pNumGroups * rNumGroups * localRange;
+
+    EIGEN_CONSTEXPR Index scratchSize =
+        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+    if (rNumGroups > 1) {
+      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
+          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
+      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
+      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+             self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+             num_coeffs_to_reduce)
+          .wait();
+      typedef SecondStepPartialReduction<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Op, Index>
+          SecondStepPartialReductionKernel;
+      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
+             temp_accessor, output,
+             cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)),
+             Index(1), reducer, num_coeffs_to_preserve, rNumGroups)
+          .wait();
+      self.device().deallocate_temp(temp_pointer);
+    } else {
+      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+             self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+             num_coeffs_to_reduce)
+          .wait();
+    }
+    return false;
+  }
+};
+}  // namespace internal
+}  // namespace TensorSycl
+
+namespace internal {
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
+  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
+    typedef std::conditional_t<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType> OutType;
+    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
+                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
+                  "The Local thread size must be a power of 2 for the reduction "
+                  "operation");
+    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+
+    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
+    // Here we force the code to perform at most a 2-step reduction: our empirical research
+    // shows that if each thread reduces at least 512 elements individually, we get better
+    // performance.
+    const Index reductionPerThread = 2048;
+    Index reductionGroup = dev.getPowerOfTwo(
+        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
+    const Index num_work_group = std::min(reductionGroup, local_range);
+    const Index global_range = num_work_group * local_range;
+
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
+    if (num_work_group > 1) {
+      CoeffReturnType *temp_pointer =
+          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
+      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
+      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
+                                                                      local_range, inputSize, reducer)
+          .wait();
+      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
+                                                          EvaluatorPointerType, Index, local_range>
+          GenericRKernel;
+      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
+             tmp_global_accessor, data,
+             cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)),
+             num_work_group, reducer)
+          .wait();
+      dev.deallocate_temp(temp_pointer);
+    } else {
+      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
+                                                                      reducer)
+          .wait();
+    }
+  }
+};
+// Vectorizable inner_most dim preserver (col reduction)
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, Eigen::SyclDevice> {
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+                  typename Self::Index num_coeffs_to_preserve) {
+    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
+                                                                                 num_coeffs_to_reduce,
+                                                                                 num_coeffs_to_preserve);
+  }
+};
+// Row reduction
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, Eigen::SyclDevice> {
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+                  typename Self::Index num_coeffs_to_preserve) {
+    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
+                                                                                 num_coeffs_to_reduce,
+                                                                                 num_coeffs_to_preserve);
+  }
+};
+
+// ArgMax uses this kernel for partial reduction.
+// TODO(@mehdi.goli) come up with a better kernel
+// generic partial reduction
+template <typename Self, typename Op>
+struct GenericReducer<Self, Op, Eigen::SyclDevice> {
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation
= false; + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce, + typename Self::Index num_coeffs_to_preserve) { + typename Self::Index range, GRange, tileSize; + dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); + + dev.template unary_kernel_launcher>( + self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1), + reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast(1)) + .wait(); + return false; + } +}; + +} // namespace internal +} // namespace Eigen + +#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h new file mode 100644 index 00000000000..7061f512093 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -0,0 +1,317 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H +#define EIGEN_CXX11_TENSOR_TENSOR_REF_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +template +class TensorLazyBaseEvaluator { + public: + TensorLazyBaseEvaluator() : m_refcount(0) {} + virtual ~TensorLazyBaseEvaluator() {} + + EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const = 0; + EIGEN_DEVICE_FUNC virtual const Scalar* data() const = 0; + + EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const = 0; + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) = 0; + + void incrRefCount() { ++m_refcount; } + void decrRefCount() { --m_refcount; } + int refCount() const { return m_refcount; } + + private: + // No copy, no assignment; + TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other); + TensorLazyBaseEvaluator& operator=(const TensorLazyBaseEvaluator& other); + + int m_refcount; +}; + +template +class TensorLazyEvaluatorReadOnly + : public TensorLazyBaseEvaluator::Scalar> { + public: + // typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Scalar Scalar; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef TensorEvaluator EvalType; + + TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) { + m_dims = m_impl.dimensions(); + m_impl.evalSubExprsIfNeeded(NULL); + } + virtual ~TensorLazyEvaluatorReadOnly() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const { return m_dims; } + EIGEN_DEVICE_FUNC virtual const Scalar* data() const { return m_impl.data(); } + + EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const { return m_impl.coeff(index); } + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex /*index*/) { + eigen_assert(false && "can't reference the coefficient of a rvalue"); + return m_dummy; + }; + + protected: + TensorEvaluator m_impl; + Dimensions m_dims; + Scalar m_dummy; +}; + +template 
+class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly { + public: + typedef TensorLazyEvaluatorReadOnly Base; + typedef typename Base::Scalar Scalar; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {} + virtual ~TensorLazyEvaluatorWritable() {} + + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) { return this->m_impl.coeffRef(index); } +}; + +template +class TensorLazyEvaluator : public std::conditional_t::value), + TensorLazyEvaluatorWritable, + TensorLazyEvaluatorReadOnly > { + public: + typedef std::conditional_t::value), + TensorLazyEvaluatorWritable, + TensorLazyEvaluatorReadOnly > + Base; + typedef typename Base::Scalar Scalar; + + TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) {} + virtual ~TensorLazyEvaluator() {} +}; + +} // namespace internal + +/** \class TensorRef + * \ingroup CXX11_Tensor_Module + * + * \brief A reference to a tensor expression + * The expression will be evaluated lazily (as much as possible). + * + */ +template +class TensorRef : public TensorBase > { + public: + typedef TensorRef Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::traits::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef Scalar* PointerType; + typedef PointerType PointerArgType; + + static constexpr Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + static constexpr int Layout = PlainObjectType::Layout; + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -----------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {} + + template + EIGEN_STRONG_INLINE TensorRef(const Expression& expr) + : m_evaluator(new internal::TensorLazyEvaluator(expr, DefaultDevice())) { + m_evaluator->incrRefCount(); + } + + template + EIGEN_STRONG_INLINE TensorRef& operator=(const Expression& expr) { + unrefEvaluator(); + m_evaluator = new internal::TensorLazyEvaluator(expr, DefaultDevice()); + m_evaluator->incrRefCount(); + return *this; + } + + ~TensorRef() { unrefEvaluator(); } + + TensorRef(const TensorRef& other) : TensorBase >(other), m_evaluator(other.m_evaluator) { + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + + TensorRef& operator=(const TensorRef& other) { + if (this != &other) { + unrefEvaluator(); + m_evaluator = other.m_evaluator; + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const 
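// The writable/read-only dispatch used by TensorLazyEvaluator above, shown in
// isolation: std::conditional_t selects the base class at compile time. A
// minimal sketch with stand-in types; Eigen's own trait is
// internal::is_lvalue, approximated here with std::is_lvalue_reference.
#include <type_traits>

struct ReadOnlyEval {};                 // coeffRef() would assert at runtime
struct WritableEval : ReadOnlyEval {};  // coeffRef() forwards to the evaluator

template <typename Expr>
using LazyEvalBase = std::conditional_t<std::is_lvalue_reference<Expr>::value,
                                        WritableEval, ReadOnlyEval>;

static_assert(std::is_same<LazyEvalBase<float&>, WritableEval>::value, "");
static_assert(std::is_same<LazyEvalBase<const float>, ReadOnlyEval>::value, "");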
{ return m_evaluator->dimensions().TotalSize(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(Index index) const { return m_evaluator->coeff(index); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const { + const std::size_t num_indices = (sizeof...(otherIndices) + 1); + const array indices{{firstIndex, otherIndices...}}; + return coeff(indices); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) { + const std::size_t num_indices = (sizeof...(otherIndices) + 1); + const array indices{{firstIndex, otherIndices...}}; + return coeffRef(indices); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(const array& indices) const { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (size_t i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices - 1]; + for (int i = NumIndices - 2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeff(index); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (size_t i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices - 1]; + for (int i = NumIndices - 2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index index) const { return m_evaluator->coeff(index); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return m_evaluator->coeffRef(index); } + + private: + EIGEN_STRONG_INLINE void unrefEvaluator() { + if (m_evaluator) { + m_evaluator->decrRefCount(); + if (m_evaluator->refCount() == 0) { + delete m_evaluator; + } + } + } + + internal::TensorLazyBaseEvaluator* m_evaluator; +}; + +// evaluator for rvalues +template +struct TensorEvaluator, Device> { + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorRef::Layout; + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) : m_ref(m) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } + + EIGEN_STRONG_INLINE void cleanup() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) 
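// A minimal usage sketch for TensorRef, assuming the usual unsupported Eigen
// include path; sizes and indices are illustrative. The point is that the
// wrapped expression is evaluated lazily, one coefficient per access, through
// the reference-counted lazy evaluator shown above.
#include <unsupported/Eigen/CXX11/Tensor>

void tensor_ref_sketch() {
  Eigen::Tensor<float, 2> a(64, 64), b(64, 64);
  a.setRandom();
  b.setRandom();

  // No bulk evaluation happens here: the TensorRef just wraps a lazy
  // evaluator for the sum expression and bumps its reference count.
  Eigen::TensorRef<Eigen::Tensor<float, 2>> ref = a + b;

  // Only this one coefficient of a + b is actually computed.
  float x = ref(3, 5);
  (void)x;
}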
const { return m_ref.coeff(index); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return m_ref.coeffRef(index); } + + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_ref.data(); } + + protected: + TensorRef m_ref; +}; + +// evaluator for lvalues +template +struct TensorEvaluator, Device> : public TensorEvaluator, Device> { + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + typedef TensorEvaluator, Device> Base; + + enum { IsAligned = false, PacketAccess = false, BlockAccess = false, PreferBlockAccess = false, RawAccess = false }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(TensorRef& m, const Device& d) : Base(m, d) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return this->m_ref.coeffRef(index); } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h new file mode 100644 index 00000000000..a81a2babfda --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -0,0 +1,410 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Navdeep Jaitly +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorReverse + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reverse elements class. 
+ * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorReverseOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorReverseOp type; +}; + +} // end namespace internal + +template +class TensorReverseOp : public TensorBase, WriteAccessors> { + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(const XprType& expr, const ReverseDimensions& reverse_dims) + : m_xpr(expr), m_reverse_dims(reverse_dims) {} + + EIGEN_DEVICE_FUNC const ReverseDimensions& reverse() const { return m_reverse_dims; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReverseOp) + + protected: + typename XprType::Nested m_xpr; + const ReverseDimensions m_reverse_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorReverseOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = NumDims > 0, + PreferBlockAccess = true, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef internal::TensorIntDivisor IndexDivisor; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reverse(op.reverse()), m_device(device) { + // Reversing a scalar isn't supported yet. It would be a no-op anyway. 
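// Stand-alone version of the column-major stride computation in the
// constructor just below: strides[0] == 1 and each further stride is the
// previous stride scaled by the previous dimension's size, so strides[i] is
// the linear distance between successive indices of dimension i. The rank
// and extents here are illustrative (N must be at least 1).
#include <array>
#include <cstddef>

template <std::size_t N>
std::array<std::ptrdiff_t, N> col_major_strides(const std::array<std::ptrdiff_t, N>& dims) {
  std::array<std::ptrdiff_t, N> strides{};
  strides[0] = 1;
  for (std::size_t i = 1; i < N; ++i) strides[i] = strides[i - 1] * dims[i - 1];
  return strides;
}
// For dims {2, 3, 4} this yields strides {1, 2, 6}.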
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Compute strides + m_dimensions = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]); + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex(Index index) const { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + Index idx = index / m_fastStrides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i]; + } + if (m_reverse[0]) { + inputIndex += (m_dimensions[0] - index - 1); + } else { + inputIndex += index; + } + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + Index idx = index / m_fastStrides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i]; + } + if (m_reverse[NumDims - 1]) { + inputIndex += (m_dimensions[NumDims - 1] - index - 1); + } else { + inputIndex += index; + } + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_impl.coeff(reverseIndex(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + // TODO(ndjaitly): write a better packing routine that uses + // local structure. + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + // Block evaluation reads underlying memory in reverse order, and default + // cost model does not properly catch this in bytes stored/loaded. + return internal::TensorBlockResourceRequirements::skewed(target_size).addCostPerCoeff({0, 0, 24}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + // TODO(ezhulenev): If underlying tensor expression supports and prefers + // block evaluation we must use it. Currently we use coeff and packet + // access into the underlying tensor expression. 
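// The index remapping performed by reverseIndex() above (column-major
// branch), restated over plain arrays instead of the evaluator members.
// Each reversed dimension maps its coordinate idx to dims[i] - idx - 1
// before the linear input index is rebuilt. N >= 1; values illustrative.
#include <array>
#include <cstddef>

template <std::size_t N>
std::ptrdiff_t reverse_index(std::ptrdiff_t index,
                             const std::array<std::ptrdiff_t, N>& dims,
                             const std::array<std::ptrdiff_t, N>& strides,
                             const std::array<bool, N>& reverse) {
  std::ptrdiff_t input = 0;
  for (std::size_t i = N - 1; i > 0; --i) {
    const std::ptrdiff_t idx = index / strides[i];  // coordinate in dim i
    index -= idx * strides[i];                      // remainder for dims < i
    input += (reverse[i] ? dims[i] - idx - 1 : idx) * strides[i];
  }
  input += reverse[0] ? (dims[0] - index - 1) : index;
  return input;
}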
+ // static const bool useBlockAccessForArgType = + // TensorEvaluator::BlockAccess && + // TensorEvaluator::PreferBlockAccess; + + static const bool isColMajor = static_cast(Layout) == static_cast(ColMajor); + + static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1; + const bool inner_dim_reversed = m_reverse[inner_dim_idx]; + + // Offset in the output block. + Index block_offset = 0; + + // Offset in the input Tensor. + Index input_offset = reverseIndex(desc.offset()); + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - 1 - i; + it[i].size = desc.dimension(dim); + it[i].count = 0; + it[i].reverse = m_reverse[dim]; + + it[i].block_stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].block_stride); + it[i].block_span = it[i].block_stride * (it[i].size - 1); + + it[i].input_stride = m_strides[dim]; + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + if (it[i].reverse) { + it[i].input_stride = -1 * it[i].input_stride; + it[i].input_span = -1 * it[i].input_span; + } + } + + // If multiple inner dimensions have the same reverse flag, check if we can + // merge them into a single virtual inner dimension. + int effective_inner_dim = 0; + for (int i = 1; i < NumDims; ++i) { + if (it[i].reverse != it[effective_inner_dim].reverse) break; + if (it[i].block_stride != it[effective_inner_dim].size) break; + if (it[i].block_stride != numext::abs(it[i].input_stride)) break; + + it[i].size = it[effective_inner_dim].size * it[i].size; + + it[i].block_stride = 1; + it[i].input_stride = (inner_dim_reversed ? -1 : 1); + + it[i].block_span = it[i].block_stride * (it[i].size - 1); + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + effective_inner_dim = i; + } + + eigen_assert(it[effective_inner_dim].block_stride == 1); + eigen_assert(it[effective_inner_dim].input_stride == (inner_dim_reversed ? -1 : 1)); + + const Index inner_dim_size = it[effective_inner_dim].size; + + // Prepare storage for the materialized reverse result. + const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + CoeffReturnType* block_buffer = block_storage.data(); + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + // Copy inner-most dimension data from reversed location in input. + Index dst = block_offset; + Index src = input_offset; + + // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed + // worse results in benchmarks than a simple coefficient loop. + if (inner_dim_reversed) { + for (Index i = 0; i < inner_dim_size; ++i) { + block_buffer[dst] = m_impl.coeff(src); + ++dst; + --src; + } + } else { + for (Index i = 0; i < inner_dim_size; ++i) { + block_buffer[dst] = m_impl.coeff(src); + ++dst; + ++src; + } + } + + // For the 1d tensor we need to generate only one inner-most dimension. + if ((NumDims - effective_inner_dim) == 1) break; + + // Update offset. 
+ for (Index i = effective_inner_dim + 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + block_offset += it[i].block_stride; + input_offset += it[i].input_stride; + break; + } + if (i != NumDims - 1) it[i].count = 0; + block_offset -= it[i].block_span; + input_offset -= it[i].input_span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + for (int i = 0; i < NumDims; ++i) { + if (m_reverse[i]) { + compute_cost += 2 * TensorOpCost::AddCost(); + } + } + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_strides; + array m_fastStrides; + TensorEvaluator m_impl; + ReverseDimensions m_reverse; + const Device EIGEN_DEVICE_REF m_device; + + private: + struct BlockIteratorState { + BlockIteratorState() + : size(0), count(0), reverse(false), block_stride(0), block_span(0), input_stride(0), input_span(0) {} + + Index size; + Index count; + bool reverse; + Index block_stride; + Index block_span; + Index input_stride; + Index input_span; + }; +}; + +// Eval as lvalue + +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorReverseOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const { + return this->m_impl.coeffRef(this->reverseIndex(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + // This code is pilfered from TensorMorphing.h + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; + internal::pstore(values, x); + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index + i) = values[i]; + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRoll.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRoll.h new file mode 100644 index 00000000000..d5b203ab619 --- /dev/null +++ 
b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRoll.h @@ -0,0 +1,361 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2024 Tobias Wood tobias@spinicist.org.uk +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ROLL_H +#define EIGEN_CXX11_TENSOR_TENSOR_ROLL_H +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorRoll + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor roll (circular shift) elements class. + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorRollOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorRollOp type; +}; + +} // end namespace internal + +template +class TensorRollOp : public TensorBase, WriteAccessors> { + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorRollOp(const XprType& expr, const RollDimensions& roll_dims) + : m_xpr(expr), m_roll_dims(roll_dims) {} + + EIGEN_DEVICE_FUNC const RollDimensions& roll() const { return m_roll_dims; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorRollOp) + + protected: + typename XprType::Nested m_xpr; + const RollDimensions m_roll_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorRollOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = NumDims > 0, + PreferBlockAccess = true, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef internal::TensorIntDivisor IndexDivisor; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + using TensorBlockDesc = internal::TensorBlockDescriptor; + using TensorBlockScratch = internal::TensorBlockScratchAllocator; + using ArgTensorBlock = typename TensorEvaluator::TensorBlock; + using 
TensorBlock = typename internal::TensorMaterializedBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_rolls(op.roll()), m_device(device) { + EIGEN_STATIC_ASSERT((NumDims > 0), Must_Have_At_Least_One_Dimension_To_Roll); + + // Compute strides + m_dimensions = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + if (m_strides[i] > 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + if (m_strides[i] > 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(nullptr); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index roll(Index const i, Index const r, Index const n) const { + auto const tmp = (i + r) % n; + if (tmp < 0) { + return tmp + n; + } else { + return tmp; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array rollCoords(array const& coords) const { + array rolledCoords; + for (int id = 0; id < NumDims; id++) { + eigen_assert(coords[id] < m_dimensions[id]); + rolledCoords[id] = roll(coords[id], m_rolls[id], m_dimensions[id]); + } + return rolledCoords; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rollIndex(Index index) const { + eigen_assert(index < dimensions().TotalSize()); + Index rolledIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + rolledIndex += roll(idx, m_rolls[i], m_dimensions[i]) * m_strides[i]; + } + rolledIndex += roll(index, m_rolls[0], m_dimensions[0]); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + rolledIndex += roll(idx, m_rolls[i], m_dimensions[i]) * m_strides[i]; + } + rolledIndex += roll(index, m_rolls[NumDims - 1], m_dimensions[NumDims - 1]); + } + return rolledIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_impl.coeff(rollIndex(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return 
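// The modular shift computed by roll() above, as a stand-alone helper. In
// C++ the % operator keeps the sign of the dividend, so a negative remainder
// has to be shifted back into [0, n); values here are illustrative.
long roll_coord(long i, long r, long n) {
  const long tmp = (i + r) % n;
  return tmp < 0 ? tmp + n : tmp;
}
// Rolling coordinate 0 by -1 in a dimension of size 5 wraps around to 4:
// roll_coord(0, -1, 5) == 4, while roll_coord(4, 1, 5) == 0.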
internal::TensorBlockResourceRequirements::skewed(target_size).addCostPerCoeff({0, 0, 24}); + } + + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + static const bool is_col_major = static_cast(Layout) == static_cast(ColMajor); + + // Compute spatial coordinates for the first block element. + array coords; + extract_coordinates(desc.offset(), coords); + array initial_coords = coords; + Index offset = 0; // Offset in the output block buffer. + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims; ++i) { + const int dim = is_col_major ? i : NumDims - 1 - i; + it[i].size = desc.dimension(dim); + it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride); + it[i].span = it[i].stride * (it[i].size - 1); + it[i].count = 0; + } + eigen_assert(it[0].stride == 1); + + // Prepare storage for the materialized generator result. + const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + CoeffReturnType* block_buffer = block_storage.data(); + + static const int inner_dim = is_col_major ? 0 : NumDims - 1; + const Index inner_dim_size = it[0].size; + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + Index i = 0; + for (; i < inner_dim_size; ++i) { + auto const rolled = rollCoords(coords); + auto const index = is_col_major ? m_dimensions.IndexOfColMajor(rolled) : m_dimensions.IndexOfRowMajor(rolled); + *(block_buffer + offset + i) = m_impl.coeff(index); + coords[inner_dim]++; + } + coords[inner_dim] = initial_coords[inner_dim]; + + if (NumDims == 1) break; // For the 1d tensor we need to generate only one inner-most dimension. + + // Update offset. + for (i = 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + offset += it[i].stride; + coords[is_col_major ? i : NumDims - 1 - i]++; + break; + } + if (i != NumDims - 1) it[i].count = 0; + coords[is_col_major ? i : NumDims - 1 - i] = initial_coords[is_col_major ? 
i : NumDims - 1 - i]; + offset -= it[i].span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + for (int i = 0; i < NumDims; ++i) { + compute_cost += 2 * TensorOpCost::AddCost(); + } + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return nullptr; } + + protected: + Dimensions m_dimensions; + array m_strides; + array m_fast_strides; + TensorEvaluator m_impl; + RollDimensions m_rolls; + const Device EIGEN_DEVICE_REF m_device; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_coordinates(Index index, array& coords) const { + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[NumDims - 1] = index; + } + } + + private: +}; + +// Eval as lvalue + +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorRollOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, + RawAccess = false + }; + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const { + return this->m_impl.coeffRef(this->rollIndex(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; + internal::pstore(values, x); + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index + i) = values[i]; + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ROLL_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h new file mode 100644 index 00000000000..d16d1490c8a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -0,0 +1,474 @@ +// This file is part of Eigen, a 
lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Igor Babuschkin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H +#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorScanOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorScanOp type; +}; +} // end namespace internal + +/** \class TensorScan + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor scan class. + */ +template +class TensorScanOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(const XprType& expr, const Index& axis, bool exclusive = false, + const Op& op = Op()) + : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index axis() const { return m_axis; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op accumulator() const { return m_accumulator; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { return m_exclusive; } + + protected: + typename XprType::Nested m_expr; + const Index m_axis; + const Op m_accumulator; + const bool m_exclusive; +}; + +namespace internal { + +template +EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset, typename Self::CoeffReturnType* data) { + // Compute the scan along the axis, starting at the given offset + typename Self::CoeffReturnType accum = self.accumulator().initialize(); + if (self.stride() == 1) { + if (self.exclusive()) { + for (Index curr = offset; curr < offset + self.size(); ++curr) { + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } + } else { + for (Index curr = offset; curr < offset + self.size(); ++curr) { + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); + } + } + } else { + if (self.exclusive()) { + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + Index curr = offset + idx3 * self.stride(); + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } + } else { + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + Index curr = offset + idx3 * self.stride(); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = 
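// The two branches of ReduceScalar above for the contiguous (stride == 1)
// case, with a plain running sum standing in for the generic reducer;
// illustrative only. Exclusive scans write the accumulator before folding
// in the current element, inclusive scans after.
#include <cstddef>
#include <vector>

void scan_1d(const std::vector<float>& in, std::vector<float>& out, bool exclusive) {
  float accum = 0.0f;  // plays the role of accumulator.initialize()
  for (std::size_t i = 0; i < in.size(); ++i) {
    if (exclusive) {
      out[i] = accum;  // prefix excluding in[i] ...
      accum += in[i];  // ... then fold in[i] into the running total
    } else {
      accum += in[i];  // fold first ...
      out[i] = accum;  // ... so the prefix includes in[i]
    }
  }
}
// {1, 2, 3} scans to {1, 3, 6} inclusively and to {0, 1, 3} exclusively.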
self.accumulator().finalize(accum); + } + } + } +} + +template +EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset, typename Self::CoeffReturnType* data) { + using Scalar = typename Self::CoeffReturnType; + using Packet = typename Self::PacketReturnType; + // Compute the scan along the axis, starting at the calculated offset + Packet accum = self.accumulator().template initializePacket(); + if (self.stride() == 1) { + if (self.exclusive()) { + for (Index curr = offset; curr < offset + self.size(); ++curr) { + internal::pstoreu(data + curr, self.accumulator().finalizePacket(accum)); + self.accumulator().reducePacket(self.inner().template packet(curr), &accum); + } + } else { + for (Index curr = offset; curr < offset + self.size(); ++curr) { + self.accumulator().reducePacket(self.inner().template packet(curr), &accum); + internal::pstoreu(data + curr, self.accumulator().finalizePacket(accum)); + } + } + } else { + if (self.exclusive()) { + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + const Index curr = offset + idx3 * self.stride(); + internal::pstoreu(data + curr, self.accumulator().finalizePacket(accum)); + self.accumulator().reducePacket(self.inner().template packet(curr), &accum); + } + } else { + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + const Index curr = offset + idx3 * self.stride(); + self.accumulator().reducePacket(self.inner().template packet(curr), &accum); + internal::pstoreu(data + curr, self.accumulator().finalizePacket(accum)); + } + } + } +} + +template +struct ReduceBlock { + EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) { + for (Index idx2 = 0; idx2 < self.stride(); idx2++) { + // Calculate the starting offset for the scan + Index offset = idx1 + idx2; + ReduceScalar(self, offset, data); + } + } +}; + +// Specialization for vectorized reduction. +template +struct ReduceBlock { + EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) { + using Packet = typename Self::PacketReturnType; + const int PacketSize = internal::unpacket_traits::size; + Index idx2 = 0; + for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) { + // Calculate the starting offset for the packet scan + Index offset = idx1 + idx2; + ReducePacket(self, offset, data); + } + for (; idx2 < self.stride(); idx2++) { + // Calculate the starting offset for the scan + Index offset = idx1 + idx2; + ReduceScalar(self, offset, data); + } + } +}; + +// Single-threaded CPU implementation of scan +template ::PacketAccess && + internal::reducer_traits::PacketAccess)> +struct ScanLauncher { + void operator()(Self& self, typename Self::CoeffReturnType* data) const { + Index total_size = internal::array_prod(self.dimensions()); + + // We fix the index along the scan axis to 0 and perform a + // scan per remaining entry. The iteration is split into two nested + // loops to avoid an integer division by keeping track of each idx1 and + // idx2. + for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) { + ReduceBlock block_reducer; + block_reducer(self, idx1, data); + } + } +}; + +#ifdef EIGEN_USE_THREADS + +// Adjust block_size to avoid false sharing of cachelines among +// threads. Currently set to twice the cache line size on Intel and ARM +// processors. 
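// Worked form of the AdjustBlockSize() rounding that follows: shard sizes
// are rounded up to a whole number of kBlockAlignment-sized groups so that
// two threads never write to the same cache line of the output. Plain
// integer math, illustrative values.
#include <algorithm>
#include <cstddef>

std::ptrdiff_t adjust_block_size(std::ptrdiff_t item_size, std::ptrdiff_t block_size) {
  const std::ptrdiff_t kBlockAlignment = 128;  // two 64-byte cache lines
  const std::ptrdiff_t items_per_line =
      std::max<std::ptrdiff_t>(1, kBlockAlignment / item_size);
  // Round block_size up to a multiple of items_per_line (div_ceil).
  return items_per_line * ((block_size + items_per_line - 1) / items_per_line);
}
// adjust_block_size(sizeof(float), 100) == 128: 100 floats round up to four
// 32-float cache-line groups.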
+EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) { + EIGEN_CONSTEXPR Index kBlockAlignment = 128; + const Index items_per_cacheline = numext::maxi(1, kBlockAlignment / item_size); + return items_per_cacheline * numext::div_ceil(block_size, items_per_cacheline); +} + +template +struct ReduceBlock { + EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) { + using Scalar = typename Self::CoeffReturnType; + using Packet = typename Self::PacketReturnType; + const int PacketSize = internal::unpacket_traits::size; + Index num_scalars = self.stride(); + Index num_packets = 0; + if (self.stride() >= PacketSize) { + num_packets = self.stride() / PacketSize; + self.device().parallelFor( + num_packets, + TensorOpCost(PacketSize * self.size(), PacketSize * self.size(), 16 * PacketSize * self.size(), true, + PacketSize), + // Make the shard size large enough that two neighboring threads + // won't write to the same cacheline of `data`. + [=](Index blk_size) { return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size); }, + [&](Index first, Index last) { + for (Index packet = first; packet < last; ++packet) { + const Index idx2 = packet * PacketSize; + ReducePacket(self, idx1 + idx2, data); + } + }); + num_scalars -= num_packets * PacketSize; + } + self.device().parallelFor( + num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()), + // Make the shard size large enough that two neighboring threads + // won't write to the same cacheline of `data`. + [=](Index blk_size) { return AdjustBlockSize(sizeof(Scalar), blk_size); }, + [&](Index first, Index last) { + for (Index scalar = first; scalar < last; ++scalar) { + const Index idx2 = num_packets * PacketSize + scalar; + ReduceScalar(self, idx1 + idx2, data); + } + }); + } +}; + +template +struct ReduceBlock { + EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) { + using Scalar = typename Self::CoeffReturnType; + self.device().parallelFor( + self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()), + // Make the shard size large enough that two neighboring threads + // won't write to the same cacheline of `data`. + [=](Index blk_size) { return AdjustBlockSize(sizeof(Scalar), blk_size); }, + [&](Index first, Index last) { + for (Index idx2 = first; idx2 < last; ++idx2) { + ReduceScalar(self, idx1 + idx2, data); + } + }); + } +}; + +// Specialization for multi-threaded execution. +template +struct ScanLauncher { + void operator()(Self& self, typename Self::CoeffReturnType* data) { + using Scalar = typename Self::CoeffReturnType; + using Packet = typename Self::PacketReturnType; + const int PacketSize = internal::unpacket_traits::size; + const Index total_size = internal::array_prod(self.dimensions()); + const Index inner_block_size = self.stride() * self.size(); + bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size)); + + if ((parallelize_by_outer_blocks && total_size <= 4096) || + (!parallelize_by_outer_blocks && self.stride() < PacketSize)) { + ScanLauncher launcher; + launcher(self, data); + return; + } + + if (parallelize_by_outer_blocks) { + // Parallelize over outer blocks. 
+ const Index num_outer_blocks = total_size / inner_block_size; + self.device().parallelFor( + num_outer_blocks, + TensorOpCost(inner_block_size, inner_block_size, 16 * PacketSize * inner_block_size, Vectorize, PacketSize), + [=](Index blk_size) { return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size); }, + [&](Index first, Index last) { + for (Index idx1 = first; idx1 < last; ++idx1) { + ReduceBlock block_reducer; + block_reducer(self, idx1 * inner_block_size, data); + } + }); + } else { + // Parallelize over inner packets/scalars dimensions when the reduction + // axis is not an inner dimension. + ReduceBlock block_reducer; + for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) { + block_reducer(self, idx1, data); + } + } + } +}; +#endif // EIGEN_USE_THREADS + +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + +// GPU implementation of scan +// TODO(ibab) This placeholder implementation performs multiple scans in +// parallel, but it would be better to use a parallel scan algorithm and +// optimize memory access. +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, + typename Self::CoeffReturnType* data) { + // Compute offset as in the CPU version + Index val = threadIdx.x + blockIdx.x * blockDim.x; + Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride(); + + if (offset + (self.size() - 1) * self.stride() < total_size) { + // Compute the scan along the axis, starting at the calculated offset + typename Self::CoeffReturnType accum = self.accumulator().initialize(); + for (Index idx = 0; idx < self.size(); idx++) { + Index curr = offset + idx * self.stride(); + if (self.exclusive()) { + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } else { + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); + } + } + } + __syncthreads(); +} + +template +struct ScanLauncher { + void operator()(const Self& self, typename Self::CoeffReturnType* data) { + Index total_size = internal::array_prod(self.dimensions()); + Index num_blocks = (total_size / self.size() + 63) / 64; + Index block_size = 64; + + LAUNCH_GPU_KERNEL((ScanKernel), num_blocks, block_size, 0, self.device(), self, total_size, data); + } +}; +#endif // EIGEN_USE_GPU && (EIGEN_GPUCC) + +} // namespace internal + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorScanOp XprType; + typedef typename XprType::Index Index; + typedef const ArgType ChildTypeNoConst; + typedef const ArgType ChildType; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef std::remove_const_t Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef TensorEvaluator, Device> Self; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, + RawAccess = true + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, 
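// The strategy selection of the multi-threaded ScanLauncher above, condensed
// into one function: small problems fall back to the sequential launcher;
// otherwise parallelism goes over independent outer blocks when there are
// enough of them, else over the inner packets/scalars of each block. The
// thresholds mirror the code; the enum and parameter names are invented for
// this sketch.
enum class ScanStrategy { Sequential, OuterBlocks, InnerDims };

ScanStrategy pick_scan_strategy(long total_size, long stride, long scan_size,
                                long packet_size) {
  const long inner_block_size = stride * scan_size;
  const bool by_outer_blocks = total_size >= stride * inner_block_size;
  if ((by_outer_blocks && total_size <= 4096) ||
      (!by_outer_blocks && stride < packet_size))
    return ScanStrategy::Sequential;
  return by_outer_blocks ? ScanStrategy::OuterBlocks : ScanStrategy::InnerDims;
}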
const Device& device) + : m_impl(op.expression(), device), + m_device(device), + m_exclusive(op.exclusive()), + m_accumulator(op.accumulator()), + m_size(m_impl.dimensions()[op.axis()]), + m_stride(1), + m_consume_dim(op.axis()), + m_output(NULL) { + // Accumulating a scalar isn't supported. + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(op.axis() >= 0 && op.axis() < NumDims); + + // Compute stride of scan axis + const Dimensions& dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < op.axis(); ++i) { + m_stride = m_stride * dims[i]; + } + } else { + // dims can only be indexed through unsigned integers, + // so let's use an unsigned type to let the compiler knows. + // This prevents stupid warnings: ""'*((void*)(& evaluator)+64)[18446744073709551615]' may be used uninitialized + // in this function" + unsigned int axis = internal::convert_index(op.axis()); + for (unsigned int i = NumDims - 1; i > axis; --i) { + m_stride = m_stride * dims[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const { return m_stride; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const { return m_consume_dim; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const { return m_size; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const { return m_accumulator; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { return m_exclusive; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& inner() const { return m_impl; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + internal::ScanLauncher launcher; + if (data) { + launcher(*this, data); + return false; + } + + const Index total_size = internal::array_prod(dimensions()); + m_output = + static_cast(m_device.get((Scalar*)m_device.allocate_temp(total_size * sizeof(Scalar)))); + launcher(*this, m_output); + return true; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_output + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_output; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_output[index]; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + + EIGEN_STRONG_INLINE void cleanup() { + if (m_output) { + m_device.deallocate_temp(m_output); + m_output = NULL; + } + m_impl.cleanup(); + } + + protected: + TensorEvaluator m_impl; + const Device EIGEN_DEVICE_REF m_device; + const bool m_exclusive; + Op m_accumulator; + const Index m_size; + Index m_stride; + Index m_consume_dim; + EvaluatorPointerType m_output; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h new file mode 100644 index 00000000000..30fde91970a --- /dev/null +++ 
b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
@@ -0,0 +1,506 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact:
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorScanSycl.h
+ *
+ * \brief:
+ * The Tensor Scan SYCL implementation extends the algorithm of
+ * "Efficient parallel scan algorithms for GPUs" to Tensor operations.
+ * The algorithm requires up to 3 stages (and consequently 3 kernels),
+ * depending on the size of the tensor. In the first kernel
+ * (ScanKernelFunctor), each thread within a work-group individually
+ * reduces its allocated elements in order to reduce the total number of
+ * blocks; all threads within the work-group then reduce the associated
+ * blocks into the temporary buffers. In the next kernel
+ * (ScanBlockKernelFunctor), the temporary buffer is given as input, and
+ * all threads within a work-group scan and reduce the boundaries between
+ * the blocks (generated by the previous kernel) and write the data to the
+ * temporary buffer. If the second kernel is required, the third and final
+ * kernel (ScanAdjustmentKernelFunctor) adjusts the final result into the
+ * output buffer.
+ * The original algorithm for the parallel prefix sum can be found here:
+ *
+ * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient
+ * parallel scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech.
+ * Rep. NVR-2008-003 1, no. 1 (2008): 1-17.
+ *****************************************************************/ + +#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP +#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace TensorSycl { +namespace internal { + +#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE +#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4) +#endif + +template +struct ScanParameters { + // must be power of 2 + static EIGEN_CONSTEXPR index_t ScanPerThread = 8; + const index_t total_size; + const index_t non_scan_size; + const index_t scan_size; + const index_t non_scan_stride; + const index_t scan_stride; + const index_t panel_threads; + const index_t group_threads; + const index_t block_threads; + const index_t elements_per_group; + const index_t elements_per_block; + const index_t loop_range; + + ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_, + index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_, + index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_) + : total_size(total_size_), + non_scan_size(non_scan_size_), + scan_size(scan_size_), + non_scan_stride(non_scan_stride_), + scan_stride(scan_stride_), + panel_threads(panel_threads_), + group_threads(group_threads_), + block_threads(block_threads_), + elements_per_group(elements_per_group_), + elements_per_block(elements_per_block_), + loop_range(loop_range_) {} +}; + +enum class scan_step { first, second }; +template +struct ScanKernelFunctor { + typedef cl::sycl::accessor + LocalAccessor; + static EIGEN_CONSTEXPR int PacketSize = ScanParameters::ScanPerThread / 2; + + LocalAccessor scratch; + Evaluator dev_eval; + OutAccessor out_ptr; + OutAccessor tmp_ptr; + const ScanParameters scanParameters; + Op accumulator; + const bool inclusive; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_, + OutAccessor out_accessor_, OutAccessor temp_accessor_, + const ScanParameters scanParameters_, Op accumulator_, + const bool inclusive_) + : scratch(scratch_), + dev_eval(dev_eval_), + out_ptr(out_accessor_), + tmp_ptr(temp_accessor_), + scanParameters(scanParameters_), + accumulator(accumulator_), + inclusive(inclusive_) {} + + template + std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read( + const Input &inpt, Index global_id) const { + return inpt.coeff(global_id); + } + + template + std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read( + const Input &inpt, Index global_id) const { + return inpt[global_id]; + } + + template + std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation( + InclusiveOp inclusive_op) const { + inclusive_op(); + } + + template + std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation( + InclusiveOp) const {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { + for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) { + Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset)); + Index tmp = data_offset % scanParameters.panel_threads; + const Index panel_id = data_offset / scanParameters.panel_threads; + const Index group_id = tmp / scanParameters.group_threads; + tmp = tmp % scanParameters.group_threads; + const Index block_id = tmp / 
scanParameters.block_threads; + const Index local_id = tmp % scanParameters.block_threads; + // we put one element per packet in scratch_mem + const Index scratch_stride = scanParameters.elements_per_block / PacketSize; + const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride; + CoeffReturnType private_scan[ScanParameters::ScanPerThread]; + CoeffReturnType inclusive_scan; + // the actual panel size is scan_size * non_scan_size. + // elements_per_panel is roundup to power of 2 for binary tree + const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size; + const Index group_offset = group_id * scanParameters.non_scan_stride; + // This will be effective when the size is bigger than elements_per_block + const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride; + const Index thread_offset = (ScanParameters::ScanPerThread * local_id * scanParameters.scan_stride); + const Index global_offset = panel_offset + group_offset + block_offset + thread_offset; + Index next_elements = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < ScanParameters::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + private_scan[i] = ((((block_id * scanParameters.elements_per_block) + + (ScanParameters::ScanPerThread * local_id) + i) < scanParameters.scan_size) && + (global_id < scanParameters.total_size)) + ? read(dev_eval, global_id) + : accumulator.initialize(); + next_elements += scanParameters.scan_stride; + } + first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC { + if (inclusive) { + inclusive_scan = private_scan[ScanParameters::ScanPerThread - 1]; + } + }); + // This for loop must be 2 + EIGEN_UNROLL_LOOP + for (int packetIndex = 0; packetIndex < ScanParameters::ScanPerThread; packetIndex += PacketSize) { + Index private_offset = 1; + // build sum in place up the tree + EIGEN_UNROLL_LOOP + for (Index d = PacketSize >> 1; d > 0; d >>= 1) { + EIGEN_UNROLL_LOOP + for (Index l = 0; l < d; l++) { + Index ai = private_offset * (2 * l + 1) - 1 + packetIndex; + Index bi = private_offset * (2 * l + 2) - 1 + packetIndex; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(private_scan[ai], &accum); + accumulator.reduce(private_scan[bi], &accum); + private_scan[bi] = accumulator.finalize(accum); + } + private_offset *= 2; + } + scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset] = + private_scan[PacketSize - 1 + packetIndex]; + private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize(); + // traverse down tree & build scan + EIGEN_UNROLL_LOOP + for (Index d = 1; d < PacketSize; d *= 2) { + private_offset >>= 1; + EIGEN_UNROLL_LOOP + for (Index l = 0; l < d; l++) { + Index ai = private_offset * (2 * l + 1) - 1 + packetIndex; + Index bi = private_offset * (2 * l + 2) - 1 + packetIndex; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(private_scan[ai], &accum); + accumulator.reduce(private_scan[bi], &accum); + private_scan[ai] = private_scan[bi]; + private_scan[bi] = accumulator.finalize(accum); + } + } + } + + Index offset = 1; + // build sum in place up the tree + for (Index d = scratch_stride >> 1; d > 0; d >>= 1) { + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (local_id < d) { + Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset; + Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset; + CoeffReturnType accum = accumulator.initialize(); + 
accumulator.reduce(scratch[ai], &accum);
+          accumulator.reduce(scratch[bi], &accum);
+          scratch[bi] = accumulator.finalize(accum);
+        }
+        offset *= 2;
+      }
+      // Synchronise
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      // next step optimisation: save this block's total so the follow-up kernels can adjust across blocks
+      if (local_id == 0) {
+        if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
+          const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
+                                    scanParameters.non_scan_size +
+                                group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
+                                block_id;
+          tmp_ptr[temp_id] = scratch[scratch_stride - 1 + scratch_offset];
+        }
+        // clear the last element
+        scratch[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
+      }
+      // traverse down tree & build scan
+      for (Index d = 1; d < scratch_stride; d *= 2) {
+        offset >>= 1;
+        // Synchronise
+        itemID.barrier(cl::sycl::access::fence_space::local_space);
+        if (local_id < d) {
+          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
+          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
+          CoeffReturnType accum = accumulator.initialize();
+          accumulator.reduce(scratch[ai], &accum);
+          accumulator.reduce(scratch[bi], &accum);
+          scratch[ai] = scratch[bi];
+          scratch[bi] = accumulator.finalize(accum);
+        }
+      }
+      // Synchronise
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      // This loop runs exactly twice: ScanPerThread / PacketSize == 2
+      EIGEN_UNROLL_LOOP
+      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
+        EIGEN_UNROLL_LOOP
+        for (Index i = 0; i < PacketSize; i++) {
+          CoeffReturnType accum = private_scan[packetIndex + i];
+          accumulator.reduce(scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
+          private_scan[packetIndex + i] = accumulator.finalize(accum);
+        }
+      }
+      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
+        if (inclusive) {
+          accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
+          private_scan[0] = accumulator.finalize(inclusive_scan);
+        }
+      });
+      next_elements = 0;
+      // write out the first set of private (per-thread) scan results
+      EIGEN_UNROLL_LOOP
+      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+        Index global_id = global_offset + next_elements;
+        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
+             scanParameters.scan_size) &&
+            (global_id < scanParameters.total_size)) {
+          Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
+          out_ptr[global_id] = private_scan[private_id];
+        }
+        next_elements += scanParameters.scan_stride;
+      }
+    }  // end for loop
+  }
+};
+
+template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
+struct ScanAdjustmentKernelFunctor {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+  InAccessor in_ptr;
+  OutAccessor out_ptr;
+  const ScanParameters<Index> scanParameters;
+  Op accumulator;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
+                                                                    OutAccessor out_accessor_,
+                                                                    const ScanParameters<Index> scanParameters_,
+                                                                    Op accumulator_)
+      : in_ptr(in_accessor_), out_ptr(out_accessor_), scanParameters(scanParameters_), accumulator(accumulator_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
+    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
+      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) *
loop_offset)); + Index tmp = data_offset % scanParameters.panel_threads; + const Index panel_id = data_offset / scanParameters.panel_threads; + const Index group_id = tmp / scanParameters.group_threads; + tmp = tmp % scanParameters.group_threads; + const Index block_id = tmp / scanParameters.block_threads; + const Index local_id = tmp % scanParameters.block_threads; + + // the actual panel size is scan_size * non_scan_size. + // elements_per_panel is roundup to power of 2 for binary tree + const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size; + const Index group_offset = group_id * scanParameters.non_scan_stride; + // This will be effective when the size is bigger than elements_per_block + const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride; + const Index thread_offset = ScanParameters::ScanPerThread * local_id * scanParameters.scan_stride; + + const Index global_offset = panel_offset + group_offset + block_offset + thread_offset; + const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block; + const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id; + CoeffReturnType adjust_val = in_ptr[in_id]; + + Index next_elements = 0; + EIGEN_UNROLL_LOOP + for (Index i = 0; i < ScanParameters::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + if ((((block_id * scanParameters.elements_per_block) + (ScanParameters::ScanPerThread * local_id) + i) < + scanParameters.scan_size) && + (global_id < scanParameters.total_size)) { + CoeffReturnType accum = adjust_val; + accumulator.reduce(out_ptr[global_id], &accum); + out_ptr[global_id] = accumulator.finalize(accum); + } + next_elements += scanParameters.scan_stride; + } + } + } +}; + +template +struct ScanInfo { + const Index &total_size; + const Index &scan_size; + const Index &panel_size; + const Index &non_scan_size; + const Index &scan_stride; + const Index &non_scan_stride; + + Index max_elements_per_block; + Index block_size; + Index panel_threads; + Index group_threads; + Index block_threads; + Index elements_per_group; + Index elements_per_block; + Index loop_range; + Index global_range; + Index local_range; + const Eigen::SyclDevice &dev; + EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_, + const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_, + const Eigen::SyclDevice &dev_) + : total_size(total_size_), + scan_size(scan_size_), + panel_size(panel_size_), + non_scan_size(non_scan_size_), + scan_stride(scan_stride_), + non_scan_stride(non_scan_stride_), + dev(dev_) { + // must be power of 2 + local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()), + Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1)); + + max_elements_per_block = local_range * ScanParameters::ScanPerThread; + + elements_per_group = + dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters::ScanPerThread)), true); + const Index elements_per_panel = elements_per_group * non_scan_size; + elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block)); + panel_threads = elements_per_panel / ScanParameters::ScanPerThread; + group_threads = elements_per_group / ScanParameters::ScanPerThread; + block_threads = elements_per_block / ScanParameters::ScanPerThread; + block_size = elements_per_group / elements_per_block; +#ifdef 
EIGEN_SYCL_MAX_GLOBAL_RANGE + const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE)); +#else + const Index max_threads = panel_threads * panel_size; +#endif + global_range = roundUp(max_threads, local_range); + loop_range = Index( + std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters::ScanPerThread))); + } + inline ScanParameters get_scan_parameter() { + return ScanParameters(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads, + group_threads, block_threads, elements_per_group, elements_per_block, loop_range); + } + inline cl::sycl::nd_range<1> get_thread_range() { + return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + } +}; + +template +struct SYCLAdjustBlockOffset { + EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr, + Reducer &accumulator, const Index total_size, + const Index scan_size, const Index panel_size, + const Index non_scan_size, const Index scan_stride, + const Index non_scan_stride, const Eigen::SyclDevice &dev) { + auto scan_info = + ScanInfo(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev); + + typedef ScanAdjustmentKernelFunctor + AdjustFuctor; + dev.template unary_kernel_launcher(in_ptr, out_ptr, scan_info.get_thread_range(), + scan_info.max_elements_per_block, + scan_info.get_scan_parameter(), accumulator) + .wait(); + } +}; + +template +struct ScanLauncher_impl { + template + EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator, + const Index total_size, const Index scan_size, const Index panel_size, + const Index non_scan_size, const Index scan_stride, + const Index non_scan_stride, const bool inclusive, + const Eigen::SyclDevice &dev) { + auto scan_info = + ScanInfo(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev); + const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size; + const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters::ScanPerThread / 2); + CoeffReturnType *temp_pointer = + static_cast(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer); + + typedef ScanKernelFunctor ScanFunctor; + dev.template binary_kernel_launcher( + in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size, + scan_info.get_scan_parameter(), accumulator, inclusive) + .wait(); + + if (scan_info.block_size > 1) { + ScanLauncher_impl::scan_block( + tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size, + non_scan_size, Index(1), scan_info.block_size, false, dev); + + SYCLAdjustBlockOffset::adjust_scan_block_offset( + tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, + non_scan_stride, dev); + } + dev.deallocate_temp(temp_pointer); + } +}; + +} // namespace internal +} // namespace TensorSycl +namespace internal { +template +struct ScanLauncher { + typedef typename Self::Index Index; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::Storage Storage; + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; + void operator()(Self &self, EvaluatorPointerType data) const { + const Index total_size = internal::array_prod(self.dimensions()); + const 
Index scan_size = self.size(); + const Index scan_stride = self.stride(); + // this is the scan op (can be sum or ...) + auto accumulator = self.accumulator(); + auto inclusive = !self.exclusive(); + auto consume_dim = self.consume_dim(); + auto dev = self.device(); + + auto dims = self.inner().dimensions(); + + Index non_scan_size = 1; + Index panel_size = 1; + if (static_cast(Self::Layout) == static_cast(ColMajor)) { + for (int i = 0; i < consume_dim; i++) { + non_scan_size *= dims[i]; + } + for (int i = consume_dim + 1; i < Self::NumDims; i++) { + panel_size *= dims[i]; + } + } else { + for (int i = Self::NumDims - 1; i > consume_dim; i--) { + non_scan_size *= dims[i]; + } + for (int i = consume_dim - 1; i >= 0; i--) { + panel_size *= dims[i]; + } + } + const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size; + auto eval_impl = self.inner(); + TensorSycl::internal::ScanLauncher_impl::scan_block( + eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, + inclusive, dev); + } +}; +} // namespace internal +} // namespace Eigen + +#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h new file mode 100644 index 00000000000..11ce0fb4e48 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -0,0 +1,415 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H +#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorShuffling + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor shuffling class. 
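+ *
+ * Output dimension i takes the size of input dimension shuffle[i]. A minimal
+ * usage sketch (the tensor names here are illustrative, not part of this
+ * file):
+ * \code
+ * Eigen::Tensor<float, 3> input(20, 30, 50);
+ * Eigen::array<int, 3> shuffle{{1, 2, 0}};
+ * Eigen::Tensor<float, 3> output = input.shuffle(shuffle);
+ * // output has dimensions (30, 50, 20)
+ * \endcode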
+ * + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorShufflingOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorShufflingOp type; +}; + +} // end namespace internal + +template +class TensorShufflingOp : public TensorBase > { + public: + typedef TensorBase > Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl) + : m_xpr(expr), m_shuffle(shfl) {} + + EIGEN_DEVICE_FUNC const Shuffle& shufflePermutation() const { return m_shuffle; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp) + + protected: + typename XprType::Nested m_xpr; + const Shuffle m_shuffle; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Self; + typedef TensorShufflingOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = true, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), m_impl(op.expression(), device) { + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const Shuffle& shuffle = op.shufflePermutation(); + m_is_identity = true; + for (int i = 0; i < NumDims; ++i) { + m_shuffle[i] = static_cast(shuffle[i]); + m_dimensions[i] = input_dims[shuffle[i]]; + m_inverseShuffle[shuffle[i]] = i; + if (m_is_identity && shuffle[i] != i) { + m_is_identity = false; + } + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_unshuffledInputStrides[0] = 1; + m_outputStrides[0] = 1; + + 
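// Column-major: dimension 0 is innermost, so strides grow from the left (inner) to the right (outer).
+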
for (int i = 1; i < NumDims; ++i) { + m_unshuffledInputStrides[i] = m_unshuffledInputStrides[i - 1] * input_dims[i - 1]; + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = + internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1)); + } + } else { + m_unshuffledInputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_unshuffledInputStrides[i] = m_unshuffledInputStrides[i + 1] * input_dims[i + 1]; + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = + internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1)); + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } + } + + template + struct PacketLoader { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType Run(const Self& self, Index index) { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = self.coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + }; + + template + struct PacketLoader { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType Run(const Self& self, Index index) { + if (self.m_is_identity) { + return self.m_impl.template packet(index); + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = self.coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + }; + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + return PacketLoader::PacketAccess>::Run(*this, index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const int inner_dim = Layout == static_cast(ColMajor) ? 0 : NumDims - 1; + + const size_t target_size = m_device.firstLevelCacheSize(); + const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim; + + // Shuffled inner dimensions leads to a random memory access, which is not + // captured by default cost model bytes loaded/stored. We add this cost + // explicitly. The number of cycles picked based on the benchmarks. + // TODO(ezhulenev): This number was picked based on a very questionable + // benchmarks, add benchmarks that are representative of real workloads. 
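+    // Uniform block sizing with an explicit per-coefficient compute cost when the inner dimension is
+    // shuffled; otherwise prefer skewed (inner-dimension contiguous) block sizing.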
+ using BlockRequirements = internal::TensorBlockResourceRequirements; + if (inner_dim_shuffled) { + return BlockRequirements::uniform(target_size).addCostPerCoeff({0, 0, NumDims * 28}); + } else { + return BlockRequirements::skewed(target_size); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool root_of_expr_ast = false) const { + eigen_assert(m_impl.data() != NULL); + + typedef internal::TensorBlockIO TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch, /*allow_strided_storage=*/root_of_expr_ast); + + typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides); + TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset())); + + TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(), block_storage.data()); + + typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle); + TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = m_is_identity + ? TensorOpCost::AddCost() + : NumDims * (2 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index + GetBlockOutputIndex(Index input_index, const DSizes& input_block_strides, + const DSizes& output_block_strides, + const DSizes, NumDims>& fast_input_block_strides) const { + Index output_index = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = input_index / fast_input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * output_block_strides[m_inverseShuffle[0]]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = input_index / fast_input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * output_block_strides[m_inverseShuffle[NumDims - 1]]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[0]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[NumDims - 1]; + } + } + + Dimensions m_dimensions; + bool m_is_identity; + array m_shuffle; + array m_inverseShuffle; // TODO(ezhulenev): Make it int type. 
+ array m_outputStrides; + array, NumDims> m_fastOutputStrides; + array m_inputStrides; + array m_unshuffledInputStrides; + + const Device EIGEN_DEVICE_REF m_device; + TensorEvaluator m_impl; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + + typedef TensorShufflingOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + static constexpr int Layout = TensorEvaluator::Layout; + + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = true, + RawAccess = false + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + internal::pstore(values, x); + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index + i) = values[i]; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) { + eigen_assert(this->m_impl.data() != NULL); + + typedef internal::TensorBlockIO TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + const Scalar* block_buffer = block.data(); + + // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen + // expression with coefficient and packet access as `src`. + void* mem = NULL; + if (block_buffer == NULL) { + mem = this->m_device.allocate(desc.size() * sizeof(Scalar)); + ScalarNoConst* buf = static_cast(mem); + + typedef internal::TensorBlockAssignment + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target(desc.dimensions(), internal::strides(desc.dimensions()), buf), + block.expr()); + + block_buffer = buf; + } + + // Read from block. + TensorBlockIOSrc src(internal::strides(desc.dimensions()), block_buffer); + + // Write to the output buffer. + typename TensorBlockIO::Dimensions output_strides(this->m_unshuffledInputStrides); + typename TensorBlockIO::Dimensions output_dimensions; + for (int i = 0; i < NumDims; ++i) { + output_dimensions[this->m_shuffle[i]] = desc.dimension(i); + } + TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(), this->srcCoeff(desc.offset())); + + // Reorder dimensions according to the shuffle. + typename TensorBlockIO::DimensionsMap dst_to_src_dim_map; + for (int i = 0; i < NumDims; ++i) { + dst_to_src_dim_map[i] = static_cast(this->m_inverseShuffle[i]); + } + TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); + + // Deallocate temporary buffer used for the block materialization. 
+ if (mem != NULL) this->m_device.deallocate(mem); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h new file mode 100644 index 00000000000..964797613ae --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -0,0 +1,144 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// Copyright (C) 2014-2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSORSTORAGE_H +#define EIGEN_CXX11_TENSOR_TENSORSTORAGE_H + +#ifdef EIGEN_TENSOR_STORAGE_CTOR_PLUGIN +#define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN EIGEN_TENSOR_STORAGE_CTOR_PLUGIN; +#else +#define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN +#endif + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \internal + * + * \class TensorStorage + * \ingroup CXX11_Tensor_Module + * + * \brief Stores the data of a tensor + * + * This class stores the data of fixed-size, dynamic-size or mixed tensors + * in a way as compact as possible. + * + * \sa Tensor + */ +template +class TensorStorage; + +// Pure fixed-size storage +template +class TensorStorage { + private: + static constexpr std::size_t Size = FixedDimensions::total_size; + + // Allocate an array of size at least one to prevent compiler warnings. + static constexpr std::size_t MinSize = max_n_1::size; + EIGEN_ALIGN_MAX T m_data[MinSize]; + + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStorage() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* data() { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T* data() const { return m_data; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const FixedDimensions dimensions() const { return FixedDimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex size() const { return Size; } +}; + +// pure dynamic +template +class TensorStorage, Options_> { + public: + typedef IndexType Index; + typedef DSizes Dimensions; + typedef TensorStorage, Options_> Self; + + EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() { + if (NumIndices_ == 0) { + m_data = internal::conditional_aligned_new_auto(1); + } + } + EIGEN_DEVICE_FUNC TensorStorage(Index size, const array& dimensions) + : m_data(internal::conditional_aligned_new_auto(size)), m_dimensions(dimensions) { + EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN + } + + template + EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) 
{ + m_data = internal::conditional_aligned_new_auto(internal::array_prod(m_dimensions)); + } + + EIGEN_DEVICE_FUNC TensorStorage(const Self& other) + : m_data(internal::conditional_aligned_new_auto( + internal::array_prod(other.m_dimensions))), + m_dimensions(other.m_dimensions) { + internal::smart_copy(other.m_data, other.m_data + internal::array_prod(other.m_dimensions), m_data); + } + EIGEN_DEVICE_FUNC Self& operator=(const Self& other) { + if (this != &other) { + Self tmp(other); + this->swap(tmp); + } + return *this; + } + + EIGEN_DEVICE_FUNC TensorStorage(Self&& other) : TensorStorage() { *this = std::move(other); } + + EIGEN_DEVICE_FUNC Self& operator=(Self&& other) { + numext::swap(m_data, other.m_data); + numext::swap(m_dimensions, other.m_dimensions); + return *this; + } + + EIGEN_DEVICE_FUNC ~TensorStorage() { + internal::conditional_aligned_delete_auto(m_data, + internal::array_prod(m_dimensions)); + } + EIGEN_DEVICE_FUNC void swap(Self& other) { + numext::swap(m_data, other.m_data); + numext::swap(m_dimensions, other.m_dimensions); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC void resize(Index size, const array& nbDimensions) { + const Index currentSz = internal::array_prod(m_dimensions); + if (size != currentSz) { + internal::conditional_aligned_delete_auto(m_data, currentSz); + if (size) + m_data = internal::conditional_aligned_new_auto(size); + else if (NumIndices_ == 0) { + m_data = internal::conditional_aligned_new_auto(1); + } else + m_data = 0; + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) + } + m_dimensions = nbDimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* data() { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T* data() const { return m_data; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } + + private: + T* m_data; + Dimensions m_dimensions; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h new file mode 100644 index 00000000000..2924abba062 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -0,0 +1,316 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorStriding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor striding class. 
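+ *
+ * Output dimension i has size ceil(input_dim[i] / strides[i]), keeping every
+ * strides[i]-th coefficient along dimension i. A minimal usage sketch (the
+ * tensor names here are illustrative, not part of this file):
+ * \code
+ * Eigen::Tensor<float, 2> input(6, 8);
+ * Eigen::array<int, 2> strides{{2, 2}};
+ * Eigen::Tensor<float, 2> output = input.stride(strides);
+ * // output has dimensions (3, 4)
+ * \endcode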
+ * + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorStridingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorStridingOp type; +}; + +} // end namespace internal + +template +class TensorStridingOp : public TensorBase > { + public: + typedef TensorBase > Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC const Strides& strides() const { return m_dims; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingOp) + + protected: + typename XprType::Nested m_xpr; + const Strides m_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorStridingOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = Eigen::numext::ceil(static_cast(m_dimensions[i]) / op.strides()[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + m_inputStrides[i - 1] *= op.strides()[i - 1]; + } + m_inputStrides[NumDims - 1] *= op.strides()[NumDims - 1]; + } else { // RowMajor + m_outputStrides[NumDims - 1] = 1; + 
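// Row-major: the last dimension is innermost, so strides grow from the right (inner) to the left (outer).
+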
m_inputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + m_inputStrides[i + 1] *= op.strides()[i + 1]; + } + m_inputStrides[0] *= op.strides()[0]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_impl.coeff(srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + PacketSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[0]; + inputIndices[1] += indices[1] * m_inputStrides[0]; + } else { // RowMajor + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[NumDims - 1]; + inputIndices[1] += indices[1] * m_inputStrides[NumDims - 1]; + } + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[PacketSize - 1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize - 1; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost() + TensorOpCost::MulCost() + + TensorOpCost::DivCost()) + + TensorOpCost::MulCost(); + if (vectorized) { + compute_cost *= 2; // packet() computes two indices + } + const int innerDim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : (NumDims - 1); + return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) + + // Computation is not vectorized per se, but it is done once per packet. 
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[0]; + } else { // RowMajor + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[NumDims - 1]; + } + return inputIndex; + } + + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorStridingOp XprType; + typedef TensorEvaluator Base; + // typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + // typedef DSizes Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = TensorEvaluator::PacketAccess, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < this->dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + PacketSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[0]; + inputIndices[1] += indices[1] * this->m_inputStrides[0]; + } else { // RowMajor + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[NumDims - 1]; + inputIndices[1] += indices[1] * this->m_inputStrides[NumDims - 1]; + } + if (inputIndices[1] - inputIndices[0] == PacketSize 
- 1) { + this->m_impl.template writePacket(inputIndices[0], x); + } else { + EIGEN_ALIGN_MAX Scalar values[PacketSize]; + internal::pstore(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize - 1]; + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize - 1; ++i) { + this->coeffRef(index + i) = values[i]; + } + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h new file mode 100644 index 00000000000..035d94e540f --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -0,0 +1,278 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gagan Goel +// Copyright (C) 2017 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRACE_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRACE_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorTrace + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor Trace class. + * + * + */ + +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions - array_size::value; + static constexpr int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorTraceOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorTraceOp type; +}; + +} // end namespace internal + +template +class TensorTraceOp : public TensorBase > { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTraceOp(const XprType& expr, const Dims& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dims& dims() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t& expression() const { + return m_xpr; + } + + protected: + typename XprType::Nested m_xpr; + const Dims m_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorTraceOp XprType; + static constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + static constexpr int NumReducedDims = internal::array_size::value; + static constexpr int NumOutputDims = NumInputDims - NumReducedDims; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type 
PacketReturnType; + static constexpr int PacketSize = internal::unpacket_traits::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_traceDim(1), m_device(device) { + EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumReducedDims >= 2) || ((NumReducedDims == 0) && (NumInputDims == 0)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + for (int i = 0; i < NumInputDims; ++i) { + m_reduced[i] = false; + } + + const Dims& op_dims = op.dims(); + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op_dims[i] >= 0); + eigen_assert(op_dims[i] < NumInputDims); + m_reduced[op_dims[i]] = true; + } + + // All the dimensions should be distinct to compute the trace + int num_distinct_reduce_dims = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + ++num_distinct_reduce_dims; + } + } + + EIGEN_ONLY_USED_FOR_DEBUG(num_distinct_reduce_dims); + eigen_assert(num_distinct_reduce_dims == NumReducedDims); + + // Compute the dimensions of the result. + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + int output_index = 0; + int reduced_index = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + m_reducedDims[reduced_index] = input_dims[i]; + if (reduced_index > 0) { + // All the trace dimensions must have the same size + eigen_assert(m_reducedDims[0] == m_reducedDims[reduced_index]); + } + ++reduced_index; + } else { + m_dimensions[output_index] = input_dims[i]; + ++output_index; + } + } + + if (NumReducedDims != 0) { + m_traceDim = m_reducedDims[0]; + } + + // Compute the output strides + if (NumOutputDims > 0) { + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_outputStrides.back() = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + } + + // Compute the input strides + if (NumInputDims > 0) { + array input_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i - 1] * input_dims[i - 1]; + } + } else { + input_strides.back() = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + } + + output_index = 0; + reduced_index = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + m_reducedStrides[reduced_index] = input_strides[i]; + ++reduced_index; + } else { + m_preservedStrides[output_index] = input_strides[i]; + ++output_index; + } + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + 
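// Eagerly evaluate the nested expression; trace coefficients themselves are computed on demand in coeff().
+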
m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + // Initialize the result + CoeffReturnType result = internal::cast(0); + Index index_stride = 0; + for (int i = 0; i < NumReducedDims; ++i) { + index_stride += m_reducedStrides[i]; + } + + // If trace is requested along all dimensions, starting index would be 0 + Index cur_index = 0; + if (NumOutputDims != 0) cur_index = firstInput(index); + for (Index i = 0; i < m_traceDim; ++i) { + result += m_impl.coeff(cur_index); + cur_index += index_stride; + } + + return result; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType result = internal::ploadt(values); + return result; + } + + protected: + // Given the output index, finds the first index in the input tensor used to compute the trace + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[0]; + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[NumOutputDims - 1]; + } + return startInput; + } + + Dimensions m_dimensions; + TensorEvaluator m_impl; + // Initialize the size of the trace dimension + Index m_traceDim; + const Device EIGEN_DEVICE_REF m_device; + array m_reduced; + array m_reducedDims; + array m_outputStrides; + array m_reducedStrides; + array m_preservedStrides; +}; + +} // End namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRACE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h new file mode 100644 index 00000000000..017b4ff6340 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -0,0 +1,231 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +template +class compute_tensor_flags { + enum { + is_dynamic_size_storage = 1, + + is_aligned = (((Options & DontAlign) == 0) && ( +#if EIGEN_MAX_STATIC_ALIGN_BYTES > 0 + (!is_dynamic_size_storage) +#else + 0 +#endif + | +#if EIGEN_MAX_ALIGN_BYTES > 0 + is_dynamic_size_storage +#else + 0 +#endif + )), + packet_access_bit = packet_traits::Vectorizable && is_aligned ? 
PacketAccessBit : 0 + }; + + public: + enum { ret = packet_access_bit }; +}; + +template +struct traits > { + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef IndexType_ Index; + static constexpr int NumDimensions = NumIndices_; + static constexpr int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags::ret | (is_const::value ? 0 : LvalueBit) + }; + template + struct MakePointer { + typedef T* Type; + }; + typedef typename MakePointer::Type PointerType; +}; + +template +struct traits > { + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef IndexType_ Index; + static constexpr int NumDimensions = array_size::value; + static constexpr int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags::ret | (is_const::value ? 0 : LvalueBit) + }; + template + struct MakePointer { + typedef T* Type; + }; + typedef typename MakePointer::Type PointerType; +}; + +template class MakePointer_> +struct traits > : public traits { + typedef traits BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + static constexpr int NumDimensions = BaseTraits::NumDimensions; + static constexpr int Layout = BaseTraits::Layout; + enum { Options = Options_, Flags = BaseTraits::Flags }; + template + struct MakePointer { + // Intermediate typedef to workaround MSVC issue. + typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; + typedef typename MakePointer::Type PointerType; +}; + +template +struct traits > : public traits { + typedef traits BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + static constexpr int NumDimensions = BaseTraits::NumDimensions; + static constexpr int Layout = BaseTraits::Layout; + enum { Options = BaseTraits::Options, Flags = BaseTraits::Flags }; + typedef typename BaseTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const Tensor EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> { + typedef const Tensor EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorFixedSize EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorFixedSize EIGEN_DEVICE_REF type; +}; + +template class MakePointer> +struct eval, Eigen::Dense> { + typedef const TensorMap EIGEN_DEVICE_REF type; +}; + +template class MakePointer> +struct eval, Eigen::Dense> { + typedef const TensorMap EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorRef EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorRef EIGEN_DEVICE_REF type; +}; + +// TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector. 
+template +struct nested { + typedef typename ref_selector::type type; +}; + +template +struct nested > { + typedef const Tensor EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef const Tensor EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef const TensorFixedSize EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef const TensorFixedSize EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef const TensorRef EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef const TensorRef EIGEN_DEVICE_REF type; +}; + +} // end namespace internal + +// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C, +// R, B), and convolve it with a set of filters, which can also be presented as +// a tensor (D, K, K, M), where M is the number of filters, K is the filter +// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For +// simplicity we assume that we always use square filters (which is usually the +// case in images), hence the two Ks in the tensor dimension. It also takes in +// a few additional parameters: +// Stride (S): The convolution stride is the offset between locations where we +// apply the filters. A larger stride means that the output will be +// spatially smaller. +// Padding (P): The padding we apply to the input tensor along the R and C +// dimensions. This is usually used to make sure that the spatial +// dimensions of the output matches our intention. +// +// Two types of padding are often used: +// SAME: The pad value is computed so that the output will have size +// R/S and C/S. +// VALID: no padding is carried out. +// When we do padding, the padded values at the padded locations are usually +// zero. +// +// The output dimensions for convolution, when given all the parameters above, +// are as follows: +// When Padding = SAME: the output size is (B, R', C', M), where +// R' = ceil(float(R) / float(S)) +// C' = ceil(float(C) / float(S)) +// where ceil is the ceiling function. The input tensor is padded with 0 as +// needed. The number of padded rows and columns are computed as: +// Pr = ((R' - 1) * S + K - R) / 2 +// Pc = ((C' - 1) * S + K - C) / 2 +// when the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2. +// This is where SAME comes from - the output has the same size as the input has. +// When Padding = VALID: the output size is computed as +// R' = ceil(float(R - K + 1) / float(S)) +// C' = ceil(float(C - K + 1) / float(S)) +// and the number of padded rows and columns are computed in the same way as in +// the SAME case. +// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0, +// Pc=0. +enum PaddingType { PADDING_VALID = 1, PADDING_SAME = 2 }; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h new file mode 100644 index 00000000000..99e51c57718 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -0,0 +1,229 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_UINT128_H +#define EIGEN_CXX11_TENSOR_TENSOR_UINT128_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +template +struct static_val { + static const uint64_t value = n; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() {} + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) { + EIGEN_UNUSED_VARIABLE(v); + eigen_assert(v == n); + } +}; + +template +struct TensorUInt128 { + HIGH high; + LOW low; + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(const TensorUInt128& other) + : high(other.high), low(other.low) { + EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128& operator=(const TensorUInt128& other) { + EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE); + high = other.high; + low = other.low; + return *this; + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE explicit TensorUInt128(const T& x) : high(0), low(x) { + eigen_assert( + (static_cast>(x) <= NumTraits::highest())); + eigen_assert(x >= 0); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(HIGH y, LOW x) : high(y), low(x) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const { return low; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LOW lower() const { return low; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HIGH upper() const { return high; } +}; + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator==(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + return (lhs.high == rhs.high) && (lhs.low == rhs.low); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator!=(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + return (lhs.high != rhs.high) || (lhs.low != rhs.low); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator>=(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + if (lhs.high != rhs.high) { + return lhs.high > rhs.high; + } + return lhs.low >= rhs.low; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator<(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + if (lhs.high != rhs.high) { + return lhs.high < rhs.high; + } + return lhs.low < rhs.low; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128 operator+(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + TensorUInt128 result(lhs.high + rhs.high, lhs.low + rhs.low); + if (result.low < rhs.low) { + result.high += 1; + } + return result; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128 operator-(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + TensorUInt128 result(lhs.high - rhs.high, lhs.low - rhs.low); + if (result.low > lhs.low) { + result.high -= 1; + } + return result; +} + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorUInt128 operator*( + const TensorUInt128& lhs, const TensorUInt128& rhs) { + // Split each 128-bit integer into 4 32-bit integers, and then do the + // multiplications by hand as follow: + // lhs a b c d + // rhs e f g h + // ----------- + // ah 
bh ch dh + // bg cg dg + // cf df + // de + // The result is stored in 2 64bit integers, high and low. + + const uint64_t LOW = 0x00000000FFFFFFFFLL; + const uint64_t HIGH = 0xFFFFFFFF00000000LL; + + uint64_t d = lhs.low & LOW; + uint64_t c = (lhs.low & HIGH) >> 32LL; + uint64_t b = lhs.high & LOW; + uint64_t a = (lhs.high & HIGH) >> 32LL; + + uint64_t h = rhs.low & LOW; + uint64_t g = (rhs.low & HIGH) >> 32LL; + uint64_t f = rhs.high & LOW; + uint64_t e = (rhs.high & HIGH) >> 32LL; + + // Compute the low 32 bits of low + uint64_t acc = d * h; + uint64_t low = acc & LOW; + // Compute the high 32 bits of low. Add a carry every time we wrap around + acc >>= 32LL; + uint64_t carry = 0; + uint64_t acc2 = acc + c * h; + if (acc2 < acc) { + carry++; + } + acc = acc2 + d * g; + if (acc < acc2) { + carry++; + } + low |= (acc << 32LL); + + // Carry forward the high bits of acc to initiate the computation of the + // low 32 bits of high + acc2 = (acc >> 32LL) | (carry << 32LL); + carry = 0; + + acc = acc2 + b * h; + if (acc < acc2) { + carry++; + } + acc2 = acc + c * g; + if (acc2 < acc) { + carry++; + } + acc = acc2 + d * f; + if (acc < acc2) { + carry++; + } + uint64_t high = acc & LOW; + + // Start to compute the high 32 bits of high. + acc2 = (acc >> 32LL) | (carry << 32LL); + + acc = acc2 + a * h; + acc2 = acc + b * g; + acc = acc2 + c * f; + acc2 = acc + d * e; + high |= (acc2 << 32LL); + + return TensorUInt128(high, low); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorUInt128 operator/( + const TensorUInt128& lhs, const TensorUInt128& rhs) { + if (rhs == TensorUInt128, static_val<1>>(1)) { + return TensorUInt128(lhs.high, lhs.low); + } else if (lhs < rhs) { + return TensorUInt128(0); + } else { + // calculate the biggest power of 2 times rhs that's less than or equal to lhs + TensorUInt128 power2(1); + TensorUInt128 d(rhs); + TensorUInt128 tmp(lhs - d); + while (lhs >= d) { + tmp = tmp - d; + d = d + d; + power2 = power2 + power2; + } + + tmp = TensorUInt128(lhs.high, lhs.low); + TensorUInt128 result(0); + while (power2 != TensorUInt128, static_val<0>>(0)) { + if (tmp >= d) { + tmp = tmp - d; + result = result + power2; + } + // Shift right + power2 = TensorUInt128(power2.high >> 1, (power2.low >> 1) | (power2.high << 63)); + d = TensorUInt128(d.high >> 1, (d.low >> 1) | (d.high << 63)); + } + + return result; + } +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_UINT128_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h new file mode 100644 index 00000000000..2f0c1354167 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -0,0 +1,619 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorVolumePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for processing of volumetric data. + * This assumes that the input has a least 4 dimensions ordered as follows: + * - channels + * - planes + * - rows + * - columns + * - (optional) additional dimensions such as time or batch size. 
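+ * For example (ColMajor), extracting 2x2x2 patches from an input of
+ * dimensions (depth, planes, rows, cols, batch) produces a tensor of
+ * dimensions (depth, 2, 2, 2, number_of_patches, batch).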
+ * Calling the volume patch code with patch_planes, patch_rows, and patch_cols + * is equivalent to calling the regular patch extraction code with parameters + * d, patch_planes, patch_rows, patch_cols, and 1 for all the additional + * dimensions. + */ +namespace internal { + +template +struct traits > : public traits { + typedef std::remove_const_t Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions + 1; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorVolumePatchOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorVolumePatchOp type; +}; + +} // end namespace internal + +template +class TensorVolumePatchOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp( + const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, DenseIndex in_plane_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, DenseIndex plane_inflate_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, PaddingType padding_type, Scalar padding_value) + : m_xpr(expr), + m_patch_planes(patch_planes), + m_patch_rows(patch_rows), + m_patch_cols(patch_cols), + m_plane_strides(plane_strides), + m_row_strides(row_strides), + m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), + m_in_row_strides(in_row_strides), + m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), + m_row_inflate_strides(row_inflate_strides), + m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), + m_padding_top_z(0), + m_padding_bottom_z(0), + m_padding_top(0), + m_padding_bottom(0), + m_padding_left(0), + m_padding_right(0), + m_padding_type(padding_type), + m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp( + const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, DenseIndex in_plane_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, DenseIndex plane_inflate_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, DenseIndex padding_top_z, + DenseIndex padding_bottom_z, DenseIndex padding_top, DenseIndex padding_bottom, DenseIndex padding_left, + DenseIndex padding_right, Scalar padding_value) + : m_xpr(expr), + m_patch_planes(patch_planes), + m_patch_rows(patch_rows), + m_patch_cols(patch_cols), + m_plane_strides(plane_strides), + m_row_strides(row_strides), + m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), + m_in_row_strides(in_row_strides), + m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), + 
m_row_inflate_strides(row_inflate_strides), + m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), + m_padding_top_z(padding_top_z), + m_padding_bottom_z(padding_bottom_z), + m_padding_top(padding_top), + m_padding_bottom(padding_bottom), + m_padding_left(padding_left), + m_padding_right(padding_right), + m_padding_type(PADDING_VALID), + m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC DenseIndex patch_planes() const { return m_patch_planes; } + EIGEN_DEVICE_FUNC DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC DenseIndex plane_strides() const { return m_plane_strides; } + EIGEN_DEVICE_FUNC DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC DenseIndex in_plane_strides() const { return m_in_plane_strides; } + EIGEN_DEVICE_FUNC DenseIndex in_row_strides() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC DenseIndex in_col_strides() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC DenseIndex plane_inflate_strides() const { return m_plane_inflate_strides; } + EIGEN_DEVICE_FUNC DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC bool padding_explicit() const { return m_padding_explicit; } + EIGEN_DEVICE_FUNC DenseIndex padding_top_z() const { return m_padding_top_z; } + EIGEN_DEVICE_FUNC DenseIndex padding_bottom_z() const { return m_padding_bottom_z; } + EIGEN_DEVICE_FUNC DenseIndex padding_top() const { return m_padding_top; } + EIGEN_DEVICE_FUNC DenseIndex padding_bottom() const { return m_padding_bottom; } + EIGEN_DEVICE_FUNC DenseIndex padding_left() const { return m_padding_left; } + EIGEN_DEVICE_FUNC DenseIndex padding_right() const { return m_padding_right; } + EIGEN_DEVICE_FUNC PaddingType padding_type() const { return m_padding_type; } + EIGEN_DEVICE_FUNC Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_planes; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_plane_strides; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const DenseIndex m_in_plane_strides; + const DenseIndex m_in_row_strides; + const DenseIndex m_in_col_strides; + const DenseIndex m_plane_inflate_strides; + const DenseIndex m_row_inflate_strides; + const DenseIndex m_col_inflate_strides; + const bool m_padding_explicit; + const DenseIndex m_padding_top_z; + const DenseIndex m_padding_bottom_z; + const DenseIndex m_padding_top; + const DenseIndex m_padding_bottom; + const DenseIndex m_padding_left; + const DenseIndex m_padding_right; + const PaddingType m_padding_type; + const Scalar m_padding_value; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorVolumePatchOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + static constexpr int NumDims = NumInputDims + 1; + typedef DSizes Dimensions; + typedef std::remove_const_t Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef 
StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { + EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); + + m_paddingValue = op.padding_value(); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + // Cache a few variables. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputDepth = input_dims[0]; + m_inputPlanes = input_dims[1]; + m_inputRows = input_dims[2]; + m_inputCols = input_dims[3]; + } else { + m_inputDepth = input_dims[NumInputDims - 1]; + m_inputPlanes = input_dims[NumInputDims - 2]; + m_inputRows = input_dims[NumInputDims - 3]; + m_inputCols = input_dims[NumInputDims - 4]; + } + + m_plane_strides = op.plane_strides(); + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // Input strides and effective input/patch size + m_in_plane_strides = op.in_plane_strides(); + m_in_row_strides = op.in_row_strides(); + m_in_col_strides = op.in_col_strides(); + m_plane_inflate_strides = op.plane_inflate_strides(); + m_row_inflate_strides = op.row_inflate_strides(); + m_col_inflate_strides = op.col_inflate_strides(); + + // The "effective" spatial size after inflating data with zeros. 
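+    // For example, 4 input rows with a row inflate stride of 2 span
+    // (4 - 1) * 2 + 1 = 7 effective rows, and a 3-row patch sampled with an
+    // in-row stride of 2 covers 3 + (3 - 1) * (2 - 1) = 5 effective rows.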
+ m_input_planes_eff = (m_inputPlanes - 1) * m_plane_inflate_strides + 1; + m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; + m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; + m_patch_planes_eff = op.patch_planes() + (op.patch_planes() - 1) * (m_in_plane_strides - 1); + m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); + m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); + + if (op.padding_explicit()) { + m_outputPlanes = + numext::ceil((m_input_planes_eff + op.padding_top_z() + op.padding_bottom_z() - m_patch_planes_eff + 1.f) / + static_cast(m_plane_strides)); + m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / + static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / + static_cast(m_col_strides)); + m_planePaddingTop = op.padding_top_z(); + m_rowPaddingTop = op.padding_top(); + m_colPaddingLeft = op.padding_left(); + } else { + // Computing padding from the type + switch (op.padding_type()) { + case PADDING_VALID: + m_outputPlanes = + numext::ceil((m_input_planes_eff - m_patch_planes_eff + 1.f) / static_cast(m_plane_strides)); + m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); + m_planePaddingTop = 0; + m_rowPaddingTop = 0; + m_colPaddingLeft = 0; + break; + case PADDING_SAME: { + m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast(m_plane_strides)); + m_outputRows = numext::ceil(m_input_rows_eff / static_cast(m_row_strides)); + m_outputCols = numext::ceil(m_input_cols_eff / static_cast(m_col_strides)); + const Index dz = (m_outputPlanes - 1) * m_plane_strides + m_patch_planes_eff - m_input_planes_eff; + const Index dy = (m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff; + const Index dx = (m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff; + m_planePaddingTop = dz / 2; + m_rowPaddingTop = dy / 2; + m_colPaddingLeft = dx / 2; + break; + } + default: { + eigen_assert(false && "unexpected padding"); + return; + } + } + } + eigen_assert(m_outputRows > 0); + eigen_assert(m_outputCols > 0); + eigen_assert(m_outputPlanes > 0); + + // Dimensions for result of extraction. + if (static_cast(Layout) == static_cast(ColMajor)) { + // ColMajor + // 0: depth + // 1: patch_planes + // 2: patch_rows + // 3: patch_cols + // 4: number of patches + // 5 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_planes(); + m_dimensions[2] = op.patch_rows(); + m_dimensions[3] = op.patch_cols(); + m_dimensions[4] = m_outputPlanes * m_outputRows * m_outputCols; + for (int i = 5; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i - 1]; + } + } else { + // RowMajor + // NumDims-1: depth + // NumDims-2: patch_planes + // NumDims-3: patch_rows + // NumDims-4: patch_cols + // NumDims-5: number of patches + // NumDims-6 and beyond: anything else (such as batch). 
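+      // For example, a RowMajor input of dimensions (batch, cols, rows,
+      // planes, depth) with 2x2x2 patches yields an output of dimensions
+      // (batch, number_of_patches, 2, 2, 2, depth).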
+ m_dimensions[NumDims - 1] = input_dims[NumInputDims - 1]; + m_dimensions[NumDims - 2] = op.patch_planes(); + m_dimensions[NumDims - 3] = op.patch_rows(); + m_dimensions[NumDims - 4] = op.patch_cols(); + m_dimensions[NumDims - 5] = m_outputPlanes * m_outputRows * m_outputCols; + for (int i = NumDims - 6; i >= 0; --i) { + m_dimensions[i] = input_dims[i]; + } + } + + // Strides for the output tensor. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_rowStride = m_dimensions[1]; + m_colStride = m_dimensions[2] * m_rowStride; + m_patchStride = m_colStride * m_dimensions[3] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[4]; + } else { + m_rowStride = m_dimensions[NumDims - 2]; + m_colStride = m_dimensions[NumDims - 3] * m_rowStride; + m_patchStride = m_colStride * m_dimensions[NumDims - 4] * m_dimensions[NumDims - 1]; + m_otherStride = m_patchStride * m_dimensions[NumDims - 5]; + } + + // Strides for navigating through the input tensor. + m_planeInputStride = m_inputDepth; + m_rowInputStride = m_inputDepth * m_inputPlanes; + m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes; + m_otherInputStride = m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes; + + m_outputPlanesRows = m_outputPlanes * m_outputRows; + + // Fast representations of different variables. + m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); + + m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); + m_fastColStride = internal::TensorIntDivisor(m_colStride); + m_fastRowStride = internal::TensorIntDivisor(m_rowStride); + m_fastInputRowStride = internal::TensorIntDivisor(m_row_inflate_strides); + m_fastInputColStride = internal::TensorIntDivisor(m_col_inflate_strides); + m_fastInputPlaneStride = internal::TensorIntDivisor(m_plane_inflate_strides); + m_fastInputColsEff = internal::TensorIntDivisor(m_input_cols_eff); + m_fastOutputPlanes = internal::TensorIntDivisor(m_outputPlanes); + m_fastOutputPlanesRows = internal::TensorIntDivisor(m_outputPlanesRows); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[0]); + } else { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims - 1]); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType /*data*/, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + + // Spatial offset within the patch. This has to be translated into 3D + // coordinates within the patch. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; + + // Batch, etc. + const Index otherIndex = (NumDims == 5) ? 0 : index / m_fastOtherStride; + const Index patch3DIndex = (NumDims == 5) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + // Calculate column index in the input original tensor. 
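+    // For example, with outputPlanes = 4 and outputRows = 5 (so
+    // outputPlanesRows = 20), patch3DIndex = 27 gives colIndex = 27 / 20 = 1;
+    // the remainder 27 - 1 * 20 = 7 is then split into rowIndex = 7 / 4 = 1
+    // and planeIndex = 27 - 4 * (1 * 5 + 1) = 3 below.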
+ const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = + (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate row index in the original input tensor. + const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + const Index rowOffset = (patchOffset - colOffset * m_colStride) / m_fastRowStride; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = + (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate plane index in the original input tensor. + const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex)); + const Index planeOffset = patchOffset - colOffset * m_colStride - rowOffset * m_rowStride; + const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop; + const Index origInputPlane = + (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0); + if (inputPlane < 0 || inputPlane >= m_input_planes_eff || + ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + + const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + + origInputPlane * m_planeInputStride + otherIndex * m_otherInputStride; + + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 || + m_in_plane_strides != 1 || m_plane_inflate_strides != 1) { + return packetWithPossibleZero(index); + } + + const Index indices[2] = {index, index + PacketSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 5) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, + (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; + + const Index patch3DIndex = + (NumDims == 5) ? 
patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch3DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + // Calculate col indices in the original input tensor. + const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft, + colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputCols[0] != inputCols[1]) { + return packetWithPossibleZero(index); + } + + const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + const Index rowOffsets[2] = {(patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride, + (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + // Calculate col indices in the original input tensor. + const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop, + rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputRows[0] != inputRows[1]) { + return packetWithPossibleZero(index); + } + + const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex)); + const Index planeOffsets[2] = {patchOffsets[0] - colOffsets[0] * m_colStride - rowOffsets[0] * m_rowStride, + patchOffsets[1] - colOffsets[1] * m_colStride - rowOffsets[1] * m_rowStride}; + eigen_assert(planeOffsets[0] <= planeOffsets[1]); + const Index inputPlanes[2] = {planeIndex * m_plane_strides + planeOffsets[0] - m_planePaddingTop, + planeIndex * m_plane_strides + planeOffsets[1] - m_planePaddingTop}; + + if (inputPlanes[1] < 0 || inputPlanes[0] >= m_inputPlanes) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) { + // no padding + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 
0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + + m_planeInputStride * inputPlanes[0] + otherIndex * m_otherInputStride; + return m_impl.template packet(inputIndex); + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = + 10 * TensorOpCost::DivCost() + 21 * TensorOpCost::MulCost() + 8 * TensorOpCost::AddCost(); + return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + const TensorEvaluator& impl() const { return m_impl; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planePaddingTop() const { return m_planePaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputPlanes() const { return m_outputPlanes; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userPlaneStride() const { return m_plane_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInPlaneStride() const { return m_in_plane_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planeInflateStride() const { return m_plane_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + + // Parameters passed to the constructor. + Index m_plane_strides; + Index m_row_strides; + Index m_col_strides; + + Index m_outputPlanes; + Index m_outputRows; + Index m_outputCols; + + Index m_planePaddingTop; + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + Index m_in_plane_strides; + Index m_in_row_strides; + Index m_in_col_strides; + + Index m_plane_inflate_strides; + Index m_row_inflate_strides; + Index m_col_inflate_strides; + + // Cached input size. + Index m_inputDepth; + Index m_inputPlanes; + Index m_inputRows; + Index m_inputCols; + + // Other cached variables. + Index m_outputPlanesRows; + + // Effective input/patch post-inflation size. + Index m_input_planes_eff; + Index m_input_rows_eff; + Index m_input_cols_eff; + Index m_patch_planes_eff; + Index m_patch_rows_eff; + Index m_patch_cols_eff; + + // Strides for the output tensor. 
+ Index m_otherStride; + Index m_patchStride; + Index m_rowStride; + Index m_colStride; + + // Strides for the input tensor. + Index m_planeInputStride; + Index m_rowInputStride; + Index m_colInputStride; + Index m_otherInputStride; + + internal::TensorIntDivisor m_fastOtherStride; + internal::TensorIntDivisor m_fastPatchStride; + internal::TensorIntDivisor m_fastColStride; + internal::TensorIntDivisor m_fastRowStride; + internal::TensorIntDivisor m_fastInputPlaneStride; + internal::TensorIntDivisor m_fastInputRowStride; + internal::TensorIntDivisor m_fastInputColStride; + internal::TensorIntDivisor m_fastInputColsEff; + internal::TensorIntDivisor m_fastOutputPlanesRows; + internal::TensorIntDivisor m_fastOutputPlanes; + internal::TensorIntDivisor m_fastOutputDepth; + + Scalar m_paddingValue; + + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h new file mode 100644 index 00000000000..51c0ad6658a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h @@ -0,0 +1,296 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +class DynamicSGroup { + public: + inline explicit DynamicSGroup() : m_numIndices(1), m_elements(), m_generators(), m_globalFlags(0) { + m_elements.push_back(ge(Generator(0, 0, 0))); + } + inline DynamicSGroup(const DynamicSGroup& o) + : m_numIndices(o.m_numIndices), + m_elements(o.m_elements), + m_generators(o.m_generators), + m_globalFlags(o.m_globalFlags) {} + inline DynamicSGroup(DynamicSGroup&& o) + : m_numIndices(o.m_numIndices), m_elements(), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { + std::swap(m_elements, o.m_elements); + } + inline DynamicSGroup& operator=(const DynamicSGroup& o) { + m_numIndices = o.m_numIndices; + m_elements = o.m_elements; + m_generators = o.m_generators; + m_globalFlags = o.m_globalFlags; + return *this; + } + inline DynamicSGroup& operator=(DynamicSGroup&& o) { + m_numIndices = o.m_numIndices; + std::swap(m_elements, o.m_elements); + m_generators = o.m_generators; + m_globalFlags = o.m_globalFlags; + return *this; + } + + void add(int one, int two, int flags = 0); + + template + inline void add(Gen_) { + add(Gen_::One, Gen_::Two, Gen_::Flags); + } + inline void addSymmetry(int one, int two) { add(one, two, 0); } + inline void addAntiSymmetry(int one, int two) { add(one, two, NegationFlag); } + inline void addHermiticity(int one, int two) { add(one, two, ConjugationFlag); } + inline void addAntiHermiticity(int one, int two) { add(one, two, NegationFlag | ConjugationFlag); } + + template + inline RV apply(const std::array& idx, RV initial, Args&&... 
args) const { + eigen_assert(N >= m_numIndices && + "Can only apply symmetry group to objects that have at least the required amount of indices."); + for (std::size_t i = 0; i < size(); i++) + initial = Op::run(h_permute(i, idx, typename internal::gen_numeric_list::type()), m_elements[i].flags, + initial, std::forward(args)...); + return initial; + } + + template + inline RV apply(const std::vector& idx, RV initial, Args&&... args) const { + eigen_assert(idx.size() >= m_numIndices && + "Can only apply symmetry group to objects that have at least the required amount of indices."); + for (std::size_t i = 0; i < size(); i++) + initial = Op::run(h_permute(i, idx), m_elements[i].flags, initial, std::forward(args)...); + return initial; + } + + inline int globalFlags() const { return m_globalFlags; } + inline std::size_t size() const { return m_elements.size(); } + + template + inline internal::tensor_symmetry_value_setter operator()(Tensor_& tensor, + typename Tensor_::Index firstIndex, + IndexTypes... otherIndices) const { + static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, + "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + } + + template + inline internal::tensor_symmetry_value_setter operator()( + Tensor_& tensor, std::array const& indices) const { + return internal::tensor_symmetry_value_setter(tensor, *this, indices); + } + + private: + struct GroupElement { + std::vector representation; + int flags; + bool isId() const { + for (std::size_t i = 0; i < representation.size(); i++) + if (i != (size_t)representation[i]) return false; + return true; + } + }; + struct Generator { + int one; + int two; + int flags; + constexpr inline Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {} + }; + + std::size_t m_numIndices; + std::vector m_elements; + std::vector m_generators; + int m_globalFlags; + + template + inline std::array h_permute(std::size_t which, const std::array& idx, + internal::numeric_list) const { + return std::array{{idx[n >= m_numIndices ? 
n : m_elements[which].representation[n]]...}}; + } + + template + inline std::vector h_permute(std::size_t which, std::vector idx) const { + std::vector result; + result.reserve(idx.size()); + for (auto k : m_elements[which].representation) result.push_back(idx[k]); + for (std::size_t i = m_numIndices; i < idx.size(); i++) result.push_back(idx[i]); + return result; + } + + inline GroupElement ge(Generator const& g) const { + GroupElement result; + result.representation.reserve(m_numIndices); + result.flags = g.flags; + for (std::size_t k = 0; k < m_numIndices; k++) { + if (k == (std::size_t)g.one) + result.representation.push_back(g.two); + else if (k == (std::size_t)g.two) + result.representation.push_back(g.one); + else + result.representation.push_back(int(k)); + } + return result; + } + + GroupElement mul(GroupElement, GroupElement) const; + inline GroupElement mul(Generator g1, GroupElement g2) const { return mul(ge(g1), g2); } + + inline GroupElement mul(GroupElement g1, Generator g2) const { return mul(g1, ge(g2)); } + + inline GroupElement mul(Generator g1, Generator g2) const { return mul(ge(g1), ge(g2)); } + + inline int findElement(GroupElement e) const { + for (auto ee : m_elements) { + if (ee.representation == e.representation) return ee.flags ^ e.flags; + } + return -1; + } + + void updateGlobalFlags(int flagDiffOfSameGenerator); +}; + +// dynamic symmetry group that auto-adds the template parameters in the constructor +template +class DynamicSGroupFromTemplateArgs : public DynamicSGroup { + public: + inline DynamicSGroupFromTemplateArgs() : DynamicSGroup() { add_all(internal::type_list()); } + inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs const& other) : DynamicSGroup(other) {} + inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs&& other) : DynamicSGroup(other) {} + inline DynamicSGroupFromTemplateArgs& operator=(const DynamicSGroupFromTemplateArgs& o) { + DynamicSGroup::operator=(o); + return *this; + } + inline DynamicSGroupFromTemplateArgs& operator=(DynamicSGroupFromTemplateArgs&& o) { + DynamicSGroup::operator=(o); + return *this; + } + + private: + template + inline void add_all(internal::type_list) { + add(Gen1()); + add_all(internal::type_list()); + } + + inline void add_all(internal::type_list<>) {} +}; + +inline DynamicSGroup::GroupElement DynamicSGroup::mul(GroupElement g1, GroupElement g2) const { + eigen_internal_assert(g1.representation.size() == m_numIndices); + eigen_internal_assert(g2.representation.size() == m_numIndices); + + GroupElement result; + result.representation.reserve(m_numIndices); + for (std::size_t i = 0; i < m_numIndices; i++) { + int v = g2.representation[g1.representation[i]]; + eigen_assert(v >= 0); + result.representation.push_back(v); + } + result.flags = g1.flags ^ g2.flags; + return result; +} + +inline void DynamicSGroup::add(int one, int two, int flags) { + eigen_assert(one >= 0); + eigen_assert(two >= 0); + eigen_assert(one != two); + + if ((std::size_t)one >= m_numIndices || (std::size_t)two >= m_numIndices) { + std::size_t newNumIndices = (one > two) ? 
one : two + 1; + for (auto& gelem : m_elements) { + gelem.representation.reserve(newNumIndices); + for (std::size_t i = m_numIndices; i < newNumIndices; i++) gelem.representation.push_back(i); + } + m_numIndices = newNumIndices; + } + + Generator g{one, two, flags}; + GroupElement e = ge(g); + + /* special case for first generator */ + if (m_elements.size() == 1) { + while (!e.isId()) { + m_elements.push_back(e); + e = mul(e, g); + } + + if (e.flags > 0) updateGlobalFlags(e.flags); + + // only add in case we didn't have identity + if (m_elements.size() > 1) m_generators.push_back(g); + return; + } + + int p = findElement(e); + if (p >= 0) { + updateGlobalFlags(p); + return; + } + + std::size_t coset_order = m_elements.size(); + m_elements.push_back(e); + for (std::size_t i = 1; i < coset_order; i++) m_elements.push_back(mul(m_elements[i], e)); + m_generators.push_back(g); + + std::size_t coset_rep = coset_order; + do { + for (auto g : m_generators) { + e = mul(m_elements[coset_rep], g); + p = findElement(e); + if (p < 0) { + // element not yet in group + m_elements.push_back(e); + for (std::size_t i = 1; i < coset_order; i++) m_elements.push_back(mul(m_elements[i], e)); + } else if (p > 0) { + updateGlobalFlags(p); + } + } + coset_rep += coset_order; + } while (coset_rep < m_elements.size()); +} + +inline void DynamicSGroup::updateGlobalFlags(int flagDiffOfSameGenerator) { + switch (flagDiffOfSameGenerator) { + case 0: + default: + // nothing happened + break; + case NegationFlag: + // every element is it's own negative => whole tensor is zero + m_globalFlags |= GlobalZeroFlag; + break; + case ConjugationFlag: + // every element is it's own conjugate => whole tensor is real + m_globalFlags |= GlobalRealFlag; + break; + case (NegationFlag | ConjugationFlag): + // every element is it's own negative conjugate => whole tensor is imaginary + m_globalFlags |= GlobalImagFlag; + break; + /* NOTE: + * since GlobalZeroFlag == GlobalRealFlag | GlobalImagFlag, if one generator + * causes the tensor to be real and the next one to be imaginary, this will + * trivially give the correct result + */ + } +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/InternalHeaderCheck.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/InternalHeaderCheck.h new file mode 100644 index 00000000000..b1b2e14c6e6 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/InternalHeaderCheck.h @@ -0,0 +1,4 @@ +#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE_H +#error \ + "Please include unsupported/Eigen/CXX11/TensorSymmetry instead of including headers inside the src directory directly." +#endif diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h new file mode 100644 index 00000000000..3f9bb51e716 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h @@ -0,0 +1,223 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +template +struct tensor_static_symgroup_permutate; + +template +struct tensor_static_symgroup_permutate> { + constexpr static std::size_t N = sizeof...(nn); + + template + constexpr static inline std::array run(const std::array& indices) { + return {{indices[nn]...}}; + } +}; + +template +struct tensor_static_symgroup_element { + typedef indices_ indices; + constexpr static int flags = flags_; +}; + +template +struct tensor_static_symgroup_element_ctor { + typedef tensor_static_symgroup_element::type, + Gen::Flags> + type; +}; + +template +struct tensor_static_symgroup_identity_ctor { + typedef tensor_static_symgroup_element::type, 0> type; +}; + +template +struct tensor_static_symgroup_multiply_helper { + template + constexpr static inline numeric_list::value...> helper(numeric_list) { + return numeric_list::value...>(); + } +}; + +template +struct tensor_static_symgroup_multiply { + private: + typedef typename A::indices iia; + typedef typename B::indices iib; + constexpr static int ffa = A::flags; + constexpr static int ffb = B::flags; + + public: + static_assert(iia::count == iib::count, "Cannot multiply symmetry elements with different number of indices."); + + typedef tensor_static_symgroup_element::helper(iia())), + ffa ^ ffb> + type; +}; + +template +struct tensor_static_symgroup_equality { + typedef typename A::indices iia; + typedef typename B::indices iib; + constexpr static int ffa = A::flags; + constexpr static int ffb = B::flags; + static_assert(iia::count == iib::count, "Cannot compare symmetry elements with different number of indices."); + + constexpr static bool value = is_same::value; + + private: + /* this should be zero if they are identical, or else the tensor + * will be forced to be pure real, pure imaginary or even pure zero + */ + constexpr static int flags_cmp_ = ffa ^ ffb; + + /* either they are not equal, then we don't care whether the flags + * match, or they are equal, and then we have to check + */ + constexpr static bool is_zero = value && flags_cmp_ == NegationFlag; + constexpr static bool is_real = value && flags_cmp_ == ConjugationFlag; + constexpr static bool is_imag = value && flags_cmp_ == (NegationFlag | ConjugationFlag); + + public: + constexpr static int global_flags = + (is_real ? GlobalRealFlag : 0) | (is_imag ? GlobalImagFlag : 0) | (is_zero ? 
GlobalZeroFlag : 0); +}; + +template +struct tensor_static_symgroup { + typedef StaticSGroup type; + constexpr static std::size_t size = type::static_size; +}; + +template +constexpr static inline std::array tensor_static_symgroup_index_permute(std::array idx, + internal::numeric_list, + internal::numeric_list) { + return {{idx[ii]..., idx[jj]...}}; +} + +template +static inline std::vector tensor_static_symgroup_index_permute(std::vector idx, + internal::numeric_list) { + std::vector result{{idx[ii]...}}; + std::size_t target_size = idx.size(); + for (std::size_t i = result.size(); i < target_size; i++) result.push_back(idx[i]); + return result; +} + +template +struct tensor_static_symgroup_do_apply; + +template +struct tensor_static_symgroup_do_apply> { + template + static inline RV run(const std::array& idx, RV initial, Args&&... args) { + static_assert(NumIndices >= SGNumIndices, + "Can only apply symmetry group to objects that have at least the required amount of indices."); + typedef typename internal::gen_numeric_list::type remaining_indices; + initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices(), remaining_indices()), + first::flags, initial, std::forward(args)...); + return tensor_static_symgroup_do_apply>::template run( + idx, initial, args...); + } + + template + static inline RV run(const std::vector& idx, RV initial, Args&&... args) { + eigen_assert(idx.size() >= SGNumIndices && + "Can only apply symmetry group to objects that have at least the required amount of indices."); + initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial, + std::forward(args)...); + return tensor_static_symgroup_do_apply>::template run( + idx, initial, args...); + } +}; + +template +struct tensor_static_symgroup_do_apply> { + template + static inline RV run(const std::array&, RV initial, Args&&...) { + // do nothing + return initial; + } + + template + static inline RV run(const std::vector&, RV initial, Args&&...) { + // do nothing + return initial; + } +}; + +} // end namespace internal + +template +class StaticSGroup { + constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices::value; + typedef internal::group_theory::enumerate_group_elements< + internal::tensor_static_symgroup_multiply, internal::tensor_static_symgroup_equality, + typename internal::tensor_static_symgroup_identity_ctor::type, + internal::type_list::type...>> + group_elements; + typedef typename group_elements::type ge; + + public: + constexpr inline StaticSGroup() {} + constexpr inline StaticSGroup(const StaticSGroup&) {} + constexpr inline StaticSGroup(StaticSGroup&&) {} + + template + static inline RV apply(const std::array& idx, RV initial, Args&&... args) { + return internal::tensor_static_symgroup_do_apply::template run(idx, initial, args...); + } + + template + static inline RV apply(const std::vector& idx, RV initial, Args&&... args) { + eigen_assert(idx.size() == NumIndices); + return internal::tensor_static_symgroup_do_apply::template run(idx, initial, args...); + } + + constexpr static std::size_t static_size = ge::count; + + constexpr static inline std::size_t size() { return ge::count; } + constexpr static inline int globalFlags() { return group_elements::global_flags; } + + template + inline internal::tensor_symmetry_value_setter> operator()( + Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... 
otherIndices) const { + static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, + "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + } + + template + inline internal::tensor_symmetry_value_setter> operator()( + Tensor_& tensor, std::array const& indices) const { + return internal::tensor_symmetry_value_setter>(tensor, *this, indices); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h new file mode 100644 index 00000000000..2d3ff466f37 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h @@ -0,0 +1,335 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +enum { NegationFlag = 0x01, ConjugationFlag = 0x02 }; + +enum { GlobalRealFlag = 0x01, GlobalImagFlag = 0x02, GlobalZeroFlag = 0x03 }; + +namespace internal { + +template +struct tensor_symmetry_pre_analysis; +template +struct tensor_static_symgroup; +template +struct tensor_static_symgroup_if; +template +struct tensor_symmetry_calculate_flags; +template +struct tensor_symmetry_assign_value; +template +struct tensor_symmetry_num_indices; + +} // end namespace internal + +template +struct Symmetry { + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = 0; +}; + +template +struct AntiSymmetry { + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = NegationFlag; +}; + +template +struct Hermiticity { + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = ConjugationFlag; +}; + +template +struct AntiHermiticity { + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = ConjugationFlag | NegationFlag; +}; + +/** \class DynamicSGroup + * \ingroup TensorSymmetry_Module + * + * \brief Dynamic symmetry group + * + * The %DynamicSGroup class represents a symmetry group that need not be known at + * compile time. 
+
+/** \class DynamicSGroup
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Dynamic symmetry group
+ *
+ * The %DynamicSGroup class represents a symmetry group that need not be known at
+ * compile time. It is useful if one wants to support arbitrary run-time definable
+ * symmetries for tensors, but it is also instantiated if a symmetry group defined
+ * at compile time would be either too large for the compiler to reasonably
+ * generate (using templates to calculate this at compile time is very inefficient)
+ * or if the compiler could generate the group but unrolling the loop for setting
+ * coefficients would no longer make sense.
+ */
+class DynamicSGroup;
+
+/** \internal
+ *
+ * \class DynamicSGroupFromTemplateArgs
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Dynamic symmetry group, initialized from template arguments
+ *
+ * This class is a child class of DynamicSGroup. It uses the template arguments
+ * specified to initialize itself.
+ */
+template
+class DynamicSGroupFromTemplateArgs;
+
+/** \class StaticSGroup
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Static symmetry group
+ *
+ * This class represents a symmetry group that is known and resolved completely
+ * at compile time. Ideally, no run-time penalty is incurred compared to the
+ * manual unrolling of the symmetry.
+ *
+ * CAUTION:
+ *
+ * Do not use this class directly for large symmetry groups. The compiler
+ * may run into a limit, or segfault, or at the very least will take a very,
+ * very, very long time to compile the code. Use the SGroup class instead
+ * if you want a static group. That class contains logic that will
+ * automatically select the DynamicSGroup class instead if the symmetry
+ * group becomes too large. (In that case, unrolling may not even be
+ * beneficial.)
+ */
+template
+class StaticSGroup;
+
+/** \class SGroup
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Symmetry group, initialized from template arguments
+ *
+ * This class represents a symmetry group whose generators are already
+ * known at compile time. It may or may not be resolved at compile time,
+ * depending on the estimated size of the group.
+ *
+ * \sa StaticSGroup
+ * \sa DynamicSGroup
+ */
+template
+class SGroup : public internal::tensor_symmetry_pre_analysis::value,
+                                                              Gen...>::root_type {
+ public:
+  constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices::value;
+  typedef typename internal::tensor_symmetry_pre_analysis::root_type Base;
+
+  // make standard constructors + assignment operators public
+  inline SGroup() : Base() {}
+  inline SGroup(const SGroup& other) : Base(other) {}
+  inline SGroup(SGroup&& other) : Base(other) {}
+  inline SGroup& operator=(const SGroup& other) {
+    Base::operator=(other);
+    return *this;
+  }
+  inline SGroup& operator=(SGroup&& other) {
+    Base::operator=(other);
+    return *this;
+  }
+
+  // all else is defined in the base class
+};
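Because the base class is chosen by tensor_symmetry_pre_analysis, the same SGroup spelling can resolve to either backend. A sketch of what that means in practice (illustrative assumptions; the exact thresholds are defined further down in this file):

    // One generator: the group has 2 elements, so SGroup's base is the
    // fully unrolled StaticSGroup; NumIndices is deduced from the generators.
    Eigen::SGroup<Eigen::Symmetry<0, 1>> sym2;
    static_assert(decltype(sym2)::NumIndices == 2);

    // Five generators exceed the 4-generator limit discussed below, so the
    // group is enumerated at run time through the DynamicSGroup backend.
    Eigen::SGroup<Eigen::Symmetry<0, 1>, Eigen::Symmetry<2, 3>, Eigen::Symmetry<4, 5>,
                  Eigen::Symmetry<6, 7>, Eigen::Symmetry<8, 9>>
        sym10;
    static_assert(decltype(sym10)::NumIndices == 10);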
+
+namespace internal {
+
+template
+struct tensor_symmetry_num_indices {
+  constexpr static std::size_t value = 1;
+};
+
+template
+struct tensor_symmetry_num_indices, Sym...> {
+ private:
+  constexpr static std::size_t One = static_cast(One_);
+  constexpr static std::size_t Two = static_cast(Two_);
+  constexpr static std::size_t Three = tensor_symmetry_num_indices::value;
+
+  // don't use std::max, since it's not constexpr until C++14...
+  constexpr static std::size_t maxOneTwoPlusOne = ((One > Two) ? One : Two) + 1;
+
+ public:
+  constexpr static std::size_t value = (maxOneTwoPlusOne > Three) ? maxOneTwoPlusOne : Three;
+};
+
+template
+struct tensor_symmetry_num_indices, Sym...>
+    : public tensor_symmetry_num_indices, Sym...> {};
+template
+struct tensor_symmetry_num_indices, Sym...>
+    : public tensor_symmetry_num_indices, Sym...> {};
+template
+struct tensor_symmetry_num_indices, Sym...>
+    : public tensor_symmetry_num_indices, Sym...> {};
+
+/** \internal
+ *
+ * \class tensor_symmetry_pre_analysis
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Pre-select whether to use a static or dynamic symmetry group
+ *
+ * When a symmetry group could in principle be determined at compile time,
+ * this template implements the logic that decides whether to actually do so
+ * or to defer the enumeration to runtime.
+ *
+ * The logic is as follows:
+ * <dl>
+ * <dt><b>No generators (trivial symmetry):</b></dt>
+ * <dd>Use a trivial static group. Ideally, this has no performance impact
+ *     compared to not using symmetry at all. In practice, this might not
+ *     be the case.</dd>
+ * <dt><b>More than 4 generators:</b></dt>
+ * <dd>Calculate the group at run time, it is likely far too large for the
+ *     compiler to be able to properly generate it in a realistic time.</dd>
+ * <dt><b>Up to and including 4 generators:</b></dt>
+ * <dd>Actually enumerate all group elements, but then check how many there
+ *     are. If there are more than 16, it is unlikely that unrolling the
+ *     loop (as is done in the static compile-time case) is sensible, so
+ *     use a dynamic group instead. If there are at most 16 elements, actually
+ *     use that static group. Note that the largest group with 4 generators
+ *     still compiles with reasonable resources.</dd>
+ * </dl>
+ *
+ * Note: Example compile time performance with g++-4.6 on an Intel Core i5-3470
+ *       with 16 GiB RAM (all generators non-redundant and the subgroups don't
+ *       factorize):
+ *
+ *          # Generators    -O0 -ggdb            -O2
+ *          ------------------------------------------------------
+ *          1               0.5 s  /   250 MiB   0.45s /   230 MiB
+ *          2               0.5 s  /   260 MiB   0.5 s /   250 MiB
+ *          3               0.65s  /   310 MiB   0.62s /   310 MiB
+ *          4               2.2 s  /   860 MiB   1.7 s /   770 MiB
+ *          5               130 s  / 13000 MiB   120 s / 11000 MiB
+ *
+ * It is clear that everything is still very efficient up to 4 generators, then
+ * the memory and CPU requirements become unreasonable. Thus we only instantiate
+ * the template group theory logic if the number of generators supplied is 4 or
+ * lower; otherwise the enumeration is forced to happen at run time, where the
+ * algorithm is reasonably fast.
+ */
+template
+struct tensor_symmetry_pre_analysis {
+  typedef StaticSGroup<> root_type;
+};
+
+template
+struct tensor_symmetry_pre_analysis {
+  constexpr static std::size_t max_static_generators = 4;
+  constexpr static std::size_t max_static_elements = 16;
+  typedef tensor_static_symgroup_if<(sizeof...(Gens_) + 1 <= max_static_generators), NumIndices, Gen_, Gens_...> helper;
+  constexpr static std::size_t possible_size = helper::size;
+
+  typedef std::conditional_t<possible_size == 0 || possible_size >= max_static_elements,
+                             DynamicSGroupFromTemplateArgs<Gen_, Gens_...>, typename helper::type>
+      root_type;
+};
+
+template
+struct tensor_static_symgroup_if {
+  constexpr static std::size_t size = 0;
+  typedef void type;
+};
+
+template
+struct tensor_static_symgroup_if : tensor_static_symgroup {};
+
+template
+struct tensor_symmetry_assign_value {
+  typedef typename Tensor_::Index Index;
+  typedef typename Tensor_::Scalar Scalar;
+  constexpr static std::size_t NumIndices = Tensor_::NumIndices;
+
+  static inline int run(const std::array& transformed_indices, int transformation_flags, int dummy,
+                        Tensor_& tensor, const Scalar& value_) {
+    Scalar value(value_);
+    if (transformation_flags & ConjugationFlag) value = numext::conj(value);
+    if (transformation_flags & NegationFlag) value = -value;
+    tensor.coeffRef(transformed_indices) = value;
+    return dummy;
+  }
+};
+
+template
+struct tensor_symmetry_calculate_flags {
+  typedef typename Tensor_::Index Index;
+  constexpr static std::size_t NumIndices = Tensor_::NumIndices;
+
+  static inline int run(const std::array& transformed_indices, int transform_flags,
+                        int current_flags, const std::array& orig_indices) {
+    if (transformed_indices == orig_indices) {
+      if ((transform_flags & (ConjugationFlag | NegationFlag)) == (ConjugationFlag | NegationFlag))
+        return current_flags | GlobalImagFlag;  // anti-hermitian diagonal
+      else if (transform_flags & ConjugationFlag)
+        return current_flags | GlobalRealFlag;  // hermitian diagonal
+      else if (transform_flags & NegationFlag)
+        return current_flags | GlobalZeroFlag;  // anti-symmetric diagonal
+    }
+    return current_flags;
+  }
+};
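A worked illustration of the diagonal-flag logic above (editorial sketch, not patch content; assumes the vendored headers are on the include path). For a Hermitian index pair, the transposition maps a diagonal coefficient to itself while carrying ConjugationFlag, so calculate_flags reports GlobalRealFlag and the diagonal must be real:

    #include <complex>
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <unsupported/Eigen/CXX11/TensorSymmetry>

    int main() {
      Eigen::Tensor<std::complex<double>, 2> h(3, 3);
      h.setZero();
      Eigen::SGroup<Eigen::Hermiticity<0, 1>> herm;

      // Off-diagonal: writes h(0,1) = v and h(1,0) = conj(v).
      herm(h, 0, 1) = std::complex<double>(1.0, 2.0);

      // Diagonal: with EIGEN_TENSOR_SYMMETRY_CHECK_VALUES defined, the
      // value setter below asserts that the assigned value is real.
      herm(h, 1, 1) = std::complex<double>(2.0, 0.0);
    }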
+
+template
+class tensor_symmetry_value_setter {
+ public:
+  typedef typename Tensor_::Index Index;
+  typedef typename Tensor_::Scalar Scalar;
+  constexpr static std::size_t NumIndices = Tensor_::NumIndices;
+
+  inline tensor_symmetry_value_setter(Tensor_& tensor, Symmetry_ const& symmetry,
+                                      std::array const& indices)
+      : m_tensor(tensor), m_symmetry(symmetry), m_indices(indices) {}
+
+  inline tensor_symmetry_value_setter& operator=(Scalar const& value) {
+    doAssign(value);
+    return *this;
+  }
+
+ private:
+  Tensor_& m_tensor;
+  Symmetry_ m_symmetry;
+  std::array m_indices;
+
+  inline void doAssign(Scalar const& value) {
+#ifdef EIGEN_TENSOR_SYMMETRY_CHECK_VALUES
+    int value_flags = m_symmetry.template apply, int>(
+        m_indices, m_symmetry.globalFlags(), m_indices);
+    if (value_flags & GlobalRealFlag) eigen_assert(numext::imag(value) == 0);
+    if (value_flags & GlobalImagFlag) eigen_assert(numext::real(value) == 0);
+#endif
+    m_symmetry.template apply, int>(m_indices, 0, m_tensor, value);
+  }
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
new file mode 100644
index 00000000000..aa16f3cd448
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
@@ -0,0 +1,492 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
+#define EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+namespace group_theory {
+
+/** \internal
+ * \file CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
+ * This file contains C++ templates that implement group theory algorithms.
+ *
+ * The algorithms allow for a compile-time analysis of finite groups.
+ *
+ * Currently only Dimino's algorithm is implemented, which returns a list
+ * of all elements in a group given a set of (possibly redundant) generators.
+ * (One could also do that with the so-called orbital algorithm, but that
+ * is much more expensive and usually has no advantages.)
+ */
+
+/**********************************************************************
+ *                  "Ok kid, here is where it gets complicated."
+ *                         - Amelia Pond in the "Doctor Who" episode
+ *                           "The Big Bang"
+ *
+ * Dimino's algorithm
+ * ==================
+ *
+ * The following is Dimino's algorithm in sequential form:
+ *
+ * Input: identity element, list of generators, equality check,
+ *        multiplication operation
+ * Output: list of group elements
+ *
+ * 1. add identity element
+ * 2. remove identities from list of generators
+ * 3. add all powers of first generator that aren't the
+ *    identity element
+ * 4. go through all remaining generators:
+ *    a. if generator is already in the list of elements
+ *       -> do nothing
+ *    b. otherwise
+ *       i.   remember current # of elements
+ *            (i.e. the size of the current subgroup)
+ *       ii.  add all current elements (which includes
+ *            the identity) each multiplied from right
+ *            with the current generator to the group
+ *       iii. add all remaining cosets that are generated
+ *            by products of the new generator with itself
+ *            and all other generators seen so far
+ *
+ * In functional form, this is implemented as a long set of recursive
+ * templates that have a complicated relationship.
+ *
+ * The main interface for Dimino's algorithm is the template
+ * enumerate_group_elements. All lists are implemented as variadic
+ * type_list and numeric_list
+ * templates.
+ *
+ * 'Calling' templates is usually done via typedefs.
+ *
+ * This implementation is an extended version of the basic algorithm.
+ * The extension is that each group element has a set of flags
+ * associated with it. Multiplication of two group elements
+ * with each other results in a group element whose flags are the
+ * XOR of the flags of the previous elements. Each time the algorithm
+ * notices that a group element it just calculated is already in the
+ * list of current elements, the flags of both will be compared, and
+ * any difference is added to the so-called 'global flags' of the group.
+ *
+ * The rationale behind this extension is that this allows not only
+ * for the description of symmetries between tensor indices, but
+ * also allows for the description of hermiticity, antisymmetry and
+ * antihermiticity. Negation and conjugation are each a specific bit
+ * in the flags value and if two different ways to reach a group
+ * element lead to two different flags, this poses a constraint on
+ * the allowed values of the resulting tensor. For example, if a
+ * group element is reached both with and without the conjugation
+ * flag, it is clear that the resulting tensor has to be real.
+ *
+ * Note that this flag mechanism is quite generic and may have other
+ * uses beyond tensor properties.
+ *
+ * IMPORTANT:
+ * This algorithm assumes the group to be finite. If you try to
+ * run it with a group that's infinite, the algorithm will only
+ * terminate once you hit a compiler limit (max template depth).
+ * Also note that trying to use this implementation to create a
+ * very large group will probably either make you hit the same
+ * limit, cause the compiler to segfault or at the very least
+ * take a *really* long time (hours, days, weeks - sic!) to
+ * compile. It is not recommended to plug in more than 4
+ * generators, unless they are independent of each other.
+ */
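For readers who find the template rendering hard to follow, here is a compact run-time transcription of the sequential pseudocode above (an editorial sketch in plain C++, not part of the patched file; it enumerates permutation groups and omits the flag tracking that Eigen's version adds):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    using Perm = std::vector<int>;  // p[i] is the image of index i

    // Composition convention: compose(a, b)[i] = a[b[i]], i.e. apply b first.
    static Perm compose(const Perm& a, const Perm& b) {
      Perm r(a.size());
      for (std::size_t i = 0; i < a.size(); ++i) r[i] = a[b[i]];
      return r;
    }

    static bool contains(const std::vector<Perm>& elems, const Perm& p) {
      return std::find(elems.begin(), elems.end(), p) != elems.end();
    }

    // Run-time Dimino enumeration, following steps 1-4 above.
    std::vector<Perm> dimino(std::size_t n, std::vector<Perm> gens) {
      Perm id(n);
      for (std::size_t i = 0; i < n; ++i) id[i] = static_cast<int>(i);

      std::vector<Perm> elements{id};                                     // step 1
      gens.erase(std::remove(gens.begin(), gens.end(), id), gens.end());  // step 2
      if (gens.empty()) return elements;

      // step 3: all powers of the first generator other than the identity
      for (Perm g = gens[0]; g != id; g = compose(g, gens[0])) elements.push_back(g);

      // step 4: fold in the remaining generators
      for (std::size_t i = 1; i < gens.size(); ++i) {
        if (contains(elements, gens[i])) continue;          // 4a
        const std::size_t subgroup_size = elements.size();  // 4b-i
        // 4b-ii: append the coset (current subgroup) * (new generator)
        for (std::size_t e = 0; e < subgroup_size; ++e)
          elements.push_back(compose(elements[e], gens[i]));
        // 4b-iii: close under products of coset representatives and generators
        for (std::size_t rep = subgroup_size; rep < elements.size(); rep += subgroup_size) {
          for (std::size_t j = 0; j <= i; ++j) {
            Perm candidate = compose(elements[rep], gens[j]);
            if (!contains(elements, candidate)) {
              for (std::size_t e = 0; e < subgroup_size; ++e)
                elements.push_back(compose(elements[e], candidate));
            }
          }
        }
      }
      return elements;
    }

    // Example: dimino(3, {{1, 0, 2}, {0, 2, 1}}) returns the 6 elements of S3.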
+
+/** \internal
+ *
+ * \class strip_identities
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Cleanse a list of group elements of the identity element
+ *
+ * This template is used to make a first pass through all initial
+ * generators of Dimino's algorithm and remove the identity
+ * elements.
+ *
+ * \sa enumerate_group_elements
+ */
+template