From 35476da35777360cf988b707bf9f9005e7b037d5 Mon Sep 17 00:00:00 2001
From: Matt
Date: Sun, 29 Dec 2024 09:24:01 -0800
Subject: [PATCH 1/4] Add extra eigen headers for gtsam

---
 upstream_utils/eigen.py | 3 +-
 .../thirdparty/eigen/include/Eigen/Dense | 7 +
 .../include/unsupported/Eigen/CXX11/Tensor | 142 ++
 .../unsupported/Eigen/CXX11/TensorSymmetry | 40 +
 .../CXX11/src/Tensor/InternalHeaderCheck.h | 3 +
 .../Eigen/CXX11/src/Tensor/Tensor.h | 382 ++++
 .../Eigen/CXX11/src/Tensor/TensorArgMax.h | 284 +++
 .../Eigen/CXX11/src/Tensor/TensorAssign.h | 213 ++
 .../Eigen/CXX11/src/Tensor/TensorBase.h | 1244 ++++++++++
 .../Eigen/CXX11/src/Tensor/TensorBlock.h | 1474 ++++++++++++
 .../CXX11/src/Tensor/TensorBroadcasting.h | 1001 ++++++++
 .../Eigen/CXX11/src/Tensor/TensorChipping.h | 474 ++++
 .../CXX11/src/Tensor/TensorConcatenation.h | 354 +++
 .../CXX11/src/Tensor/TensorContraction.h | 962 ++++++++
 .../src/Tensor/TensorContractionBlocking.h | 69 +
 .../CXX11/src/Tensor/TensorContractionCuda.h | 7 +
 .../CXX11/src/Tensor/TensorContractionGpu.h | 1387 +++++++++++
 .../src/Tensor/TensorContractionMapper.h | 533 +++++
 .../CXX11/src/Tensor/TensorContractionSycl.h | 1654 ++++++++++++++
 .../src/Tensor/TensorContractionThreadPool.h | 1552 +++++++++++++
 .../Eigen/CXX11/src/Tensor/TensorConversion.h | 416 ++++
 .../CXX11/src/Tensor/TensorConvolution.h | 1123 +++++++++
 .../CXX11/src/Tensor/TensorConvolutionSycl.h | 546 +++++
 .../Eigen/CXX11/src/Tensor/TensorCostModel.h | 190 ++
 .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 309 +++
 .../Eigen/CXX11/src/Tensor/TensorDevice.h | 139 ++
 .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 7 +
 .../CXX11/src/Tensor/TensorDeviceDefault.h | 113 +
 .../Eigen/CXX11/src/Tensor/TensorDeviceGpu.h | 395 ++++
 .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 567 +++++
 .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 376 +++
 .../CXX11/src/Tensor/TensorDimensionList.h | 119 +
 .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 329 +++
 .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 196 ++
 .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 859 +++++++
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 672 ++++++
 .../Eigen/CXX11/src/Tensor/TensorExpr.h | 329 +++
 .../Eigen/CXX11/src/Tensor/TensorFFT.h | 674 ++++++
 .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 226 ++
 .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 233 ++
 .../src/Tensor/TensorForwardDeclarations.h | 215 ++
 .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 422 ++++
 .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 271 +++
 .../CXX11/src/Tensor/TensorGlobalFunctions.h | 36 +
 .../src/Tensor/TensorGpuHipCudaDefines.h | 101 +
 .../src/Tensor/TensorGpuHipCudaUndefines.h | 45 +
 .../Eigen/CXX11/src/Tensor/TensorIO.h | 413 ++++
 .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 590 +++++
 .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 620 +++++
 .../Eigen/CXX11/src/Tensor/TensorInflation.h | 226 ++
 .../CXX11/src/Tensor/TensorInitializer.h | 78 +
 .../Eigen/CXX11/src/Tensor/TensorIntDiv.h | 261 +++
 .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 185 ++
 .../Eigen/CXX11/src/Tensor/TensorMacros.h | 85 +
 .../Eigen/CXX11/src/Tensor/TensorMap.h | 191 ++
 .../Eigen/CXX11/src/Tensor/TensorMeta.h | 292 +++
 .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 984 ++++++++
 .../Eigen/CXX11/src/Tensor/TensorPadding.h | 620 +++++
 .../Eigen/CXX11/src/Tensor/TensorPatch.h | 260 +++
 .../Eigen/CXX11/src/Tensor/TensorRandom.h | 309 +++
 .../Eigen/CXX11/src/Tensor/TensorReduction.h | 1038 +++++++++
 .../CXX11/src/Tensor/TensorReductionGpu.h | 958 ++++++++
 .../CXX11/src/Tensor/TensorReductionSycl.h | 588 +++++
 .../Eigen/CXX11/src/Tensor/TensorRef.h | 317 +++
 .../Eigen/CXX11/src/Tensor/TensorReverse.h | 410 ++++
 .../Eigen/CXX11/src/Tensor/TensorRoll.h | 361 +++
 .../Eigen/CXX11/src/Tensor/TensorScan.h | 474 ++++
 .../Eigen/CXX11/src/Tensor/TensorScanSycl.h | 506 ++++
 .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 415 ++++
 .../Eigen/CXX11/src/Tensor/TensorStorage.h | 144 ++
 .../Eigen/CXX11/src/Tensor/TensorStriding.h | 316 +++
 .../Eigen/CXX11/src/Tensor/TensorTrace.h | 278 +++
 .../Eigen/CXX11/src/Tensor/TensorTraits.h | 231 ++
 .../Eigen/CXX11/src/Tensor/TensorUInt128.h | 229 ++
 .../CXX11/src/Tensor/TensorVolumePatch.h | 619 +++++
 .../src/TensorSymmetry/DynamicSymmetry.h | 296 +++
 .../src/TensorSymmetry/InternalHeaderCheck.h | 4 +
 .../CXX11/src/TensorSymmetry/StaticSymmetry.h | 223 ++
 .../Eigen/CXX11/src/TensorSymmetry/Symmetry.h | 335 +++
 .../TensorSymmetry/util/TemplateGroupTheory.h | 492 ++++
 .../unsupported/Eigen/SpecialFunctions | 104 +
 .../KroneckerProduct/KroneckerTensorProduct.h | 284 +++
 .../BesselFunctionsArrayAPI.h | 276 +++
 .../BesselFunctionsBFloat16.h | 71 +
 .../BesselFunctionsFunctors.h | 323 +++
 .../SpecialFunctions/BesselFunctionsHalf.h | 69 +
 .../SpecialFunctions/BesselFunctionsImpl.h | 1638 +++++++++++++
 .../BesselFunctionsPacketMath.h | 108 +
 .../SpecialFunctions/HipVectorCompatibility.h | 71 +
 .../SpecialFunctions/InternalHeaderCheck.h | 4 +
 .../SpecialFunctionsArrayAPI.h | 159 ++
 .../SpecialFunctionsBFloat16.h | 73 +
 .../SpecialFunctionsFunctors.h | 326 +++
 .../SpecialFunctions/SpecialFunctionsHalf.h | 73 +
 .../SpecialFunctions/SpecialFunctionsImpl.h | 2030 +++++++++++++++++
 .../SpecialFunctionsPacketMath.h | 112 +
 .../arch/AVX/BesselFunctions.h | 46 +
 .../arch/AVX/SpecialFunctions.h | 16 +
 .../arch/AVX512/BesselFunctions.h | 46 +
 .../arch/AVX512/SpecialFunctions.h | 16 +
 .../arch/GPU/SpecialFunctions.h | 317 +++
 .../arch/NEON/BesselFunctions.h | 54 +
 .../arch/NEON/SpecialFunctions.h | 34 +
 103 files changed, 40290 insertions(+), 1 deletion(-)
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/Eigen/Dense
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/Tensor
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/TensorSymmetry
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/InternalHeaderCheck.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRoll.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/InternalHeaderCheck.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/SpecialFunctions
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/InternalHeaderCheck.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h
 create mode 100644 wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h
diff --git a/upstream_utils/eigen.py b/upstream_utils/eigen.py
index 875c6dfbbae..0974b1b6b56 100755
--- a/upstream_utils/eigen.py
+++ b/upstream_utils/eigen.py
@@ -49,6 +49,7 @@ def eigen_inclusions(dp, f):
     modules = [
         "Cholesky",
         "Core",
+        "Dense",
         "Eigenvalues",
         "Geometry",
         "Householder",
@@ -86,7 +87,7 @@ def unsupported_inclusions(dp, f):
         return False
 
     # Include the MatrixFunctions module
-    return "MatrixFunctions" in abspath
+    return "MatrixFunctions" in abspath or "SpecialFunctions" in abspath or "Tensor" in abspath or abspath.endswith("SpecialFunctionsFunctors.h")
 
 
 def copy_upstream_src(wpilib_root):
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/Eigen/Dense b/wpimath/src/main/native/thirdparty/eigen/include/Eigen/Dense
new file mode 100644
index 00000000000..5768910bd88
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/Eigen/Dense
@@ -0,0 +1,7 @@
+#include "Core"
+#include "LU"
+#include "Cholesky"
+#include "QR"
+#include "SVD"
+#include "Geometry"
+#include "Eigenvalues"
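The new Eigen/Dense file above is the standard Eigen umbrella header, so downstream code (e.g. gtsam) can pull in all dense modules with one include. A minimal sketch of what it enables, not part of the patch:

#include <Eigen/Dense>

int main() {
  Eigen::Matrix3d A = Eigen::Matrix3d::Random();
  Eigen::Vector3d b(1.0, 2.0, 3.0);
  // LU, QR, SVD, etc. all come in through the Dense umbrella header.
  Eigen::Vector3d x = A.fullPivLu().solve(b);
  return x.allFinite() ? 0 : 1;
}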
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/Tensor b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/Tensor
new file mode 100644
index 00000000000..6ac89f1b67e
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/Tensor
@@ -0,0 +1,142 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+// Copyright (C) 2013 Christian Seiler
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// #ifndef EIGEN_CXX11_TENSOR_MODULE_H
+#define EIGEN_CXX11_TENSOR_MODULE_H
+
+#include "../../../Eigen/Core"
+
+#include "../SpecialFunctions"
+
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+// IWYU pragma: begin_exports
+#include "../../../Eigen/src/Core/util/Meta.h"
+#include "../../../Eigen/src/Core/util/MaxSizeVector.h"
+// IWYU pragma: end_exports
+
+/** \defgroup CXX11_Tensor_Module Tensor Module
+ *
+ * This module provides a Tensor class for storing arbitrarily indexed
+ * objects.
+ *
+ * \code
+ * #include <unsupported/Eigen/CXX11/Tensor>
+ * \endcode
+ *
+ * Much of the documentation can be found \ref eigen_tensors "here".
+ */
+
+#include <atomic>
+#include <chrono>
+#include <cmath>
+#include <cstddef>
+#include <cstring>
+#include <iterator>
+#include <numeric>
+#include <random>
+#include <thread>
+
+#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
+// #include "ThreadPool"
+#endif
+
+#ifdef EIGEN_USE_GPU
+#include <iostream>
+#if defined(EIGEN_USE_HIP)
+#include <hip/hip_runtime.h>
+#else
+#include <cuda_runtime.h>
+#endif
+#endif
+
+// IWYU pragma: begin_exports
+#include "src/Tensor/TensorMacros.h"
+#include "src/Tensor/TensorForwardDeclarations.h"
+#include "src/Tensor/TensorMeta.h"
+#include "src/Tensor/TensorFunctors.h"
+#include "src/Tensor/TensorCostModel.h"
+#include "src/Tensor/TensorDeviceDefault.h"
+#include "src/Tensor/TensorDeviceThreadPool.h"
+#include "src/Tensor/TensorDeviceGpu.h"
+#ifndef gpu_assert
+#define gpu_assert(x)
+#endif
+#include "src/Tensor/TensorDeviceSycl.h"
+#include "src/Tensor/TensorIndexList.h"
+#include "src/Tensor/TensorDimensionList.h"
+#include "src/Tensor/TensorDimensions.h"
+#include "src/Tensor/TensorInitializer.h"
+#include "src/Tensor/TensorTraits.h"
+#include "src/Tensor/TensorRandom.h"
+#include "src/Tensor/TensorUInt128.h"
+#include "src/Tensor/TensorIntDiv.h"
+#include "src/Tensor/TensorGlobalFunctions.h"
+
+#include "src/Tensor/TensorIO.h"
+
+#include "src/Tensor/TensorBase.h"
+#include "src/Tensor/TensorBlock.h"
+
+#include "src/Tensor/TensorEvaluator.h"
+#include "src/Tensor/TensorExpr.h"
+#include "src/Tensor/TensorReduction.h"
+#include "src/Tensor/TensorReductionGpu.h"
+#include "src/Tensor/TensorArgMax.h"
+#include "src/Tensor/TensorConcatenation.h"
+#include "src/Tensor/TensorContractionMapper.h"
+#include "src/Tensor/TensorContractionBlocking.h"
+#include "src/Tensor/TensorContraction.h"
+#include "src/Tensor/TensorContractionThreadPool.h"
+#include "src/Tensor/TensorContractionGpu.h"
+#include "src/Tensor/TensorConversion.h"
+#include "src/Tensor/TensorConvolution.h"
+#include "src/Tensor/TensorFFT.h"
+#include "src/Tensor/TensorPatch.h"
+#include "src/Tensor/TensorImagePatch.h"
+#include "src/Tensor/TensorVolumePatch.h"
+#include "src/Tensor/TensorBroadcasting.h"
+#include "src/Tensor/TensorChipping.h"
+#include "src/Tensor/TensorInflation.h"
+#include "src/Tensor/TensorLayoutSwap.h"
+#include "src/Tensor/TensorMorphing.h"
+#include "src/Tensor/TensorPadding.h"
+#include "src/Tensor/TensorReverse.h"
+#include "src/Tensor/TensorRoll.h"
+#include "src/Tensor/TensorShuffling.h"
+#include "src/Tensor/TensorStriding.h"
+#include "src/Tensor/TensorCustomOp.h"
+#include "src/Tensor/TensorEvalTo.h"
+#include "src/Tensor/TensorForcedEval.h"
+#include "src/Tensor/TensorGenerator.h"
+#include "src/Tensor/TensorAssign.h"
+#include "src/Tensor/TensorScan.h"
+#include "src/Tensor/TensorTrace.h"
+
+#ifdef EIGEN_USE_SYCL
+#include "src/Tensor/TensorReductionSycl.h"
+#include "src/Tensor/TensorConvolutionSycl.h"
+#include "src/Tensor/TensorContractionSycl.h"
+#include "src/Tensor/TensorScanSycl.h"
+#endif
+
+#include "src/Tensor/TensorExecutor.h"
+#include "src/Tensor/TensorDevice.h"
+
+#include "src/Tensor/TensorStorage.h"
+#include "src/Tensor/Tensor.h"
+#include "src/Tensor/TensorFixedSize.h"
+#include "src/Tensor/TensorMap.h"
+#include "src/Tensor/TensorRef.h"
+// IWYU pragma: end_exports
+
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+// #endif // EIGEN_CXX11_TENSOR_MODULE_H
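As a quick smoke test of the module header above, a hypothetical snippet (not part of the patch, assuming the vendored include path is on the compiler's search path):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<double, 3> t(2, 3, 4);   // rank-3, dynamically sized
  t.setConstant(1.0);
  t(0, 1, 2) = 5.0;
  Eigen::Tensor<double, 0> s = t.sum();  // reductions yield rank-0 tensors
  return s() == 28.0 ? 0 : 1;            // 23 ones plus the 5.0 overwrite
}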
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/TensorSymmetry b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/TensorSymmetry
new file mode 100644
index 00000000000..d4a44d0a704
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/TensorSymmetry
@@ -0,0 +1,40 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE_H
+#define EIGEN_CXX11_TENSORSYMMETRY_MODULE_H
+
+#include "Tensor"
+
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+// #include "src/util/CXX11Meta.h"
+
+/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module
+ *
+ * This module provides classes that allow for the definition of
+ * symmetries w.r.t. tensor indices.
+ *
+ * Including this module will implicitly include the Tensor module.
+ *
+ * \code
+ * #include <unsupported/Eigen/CXX11/TensorSymmetry>
+ * \endcode
+ */
+
+// IWYU pragma: begin_exports
+#include "src/TensorSymmetry/util/TemplateGroupTheory.h"
+#include "src/TensorSymmetry/Symmetry.h"
+#include "src/TensorSymmetry/StaticSymmetry.h"
+#include "src/TensorSymmetry/DynamicSymmetry.h"
+// IWYU pragma: end_exports
+
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif  // EIGEN_CXX11_TENSORSYMMETRY_MODULE_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/InternalHeaderCheck.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/InternalHeaderCheck.h
new file mode 100644
index 00000000000..9e4c1ed9c37
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_CXX11_TENSOR_MODULE_H
+#error "Please include unsupported/Eigen/CXX11/Tensor instead of including headers inside the src directory directly."
+#endif
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
new file mode 100644
index 00000000000..9dc95916c0c
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -0,0 +1,382 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+// Copyright (C) 2013 Christian Seiler
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class Tensor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor class.
+ *
+ * The %Tensor class is the work-horse for all \em dense tensors within Eigen.
+ *
+ * The %Tensor class encompasses only dynamic-size objects so far.
+ *
+ * The first two template parameters are required:
+ * \tparam Scalar_ Numeric type, e.g. float, double, int or `std::complex<float>`.
+ * User defined scalar types are supported as well (see \ref user_defined_scalars "here").
+ * \tparam NumIndices_ Number of indices (i.e. rank of the tensor)
+ *
+ * The remaining template parameters are optional -- in most cases you don't have to worry about them.
+ * \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of either
+ * \b #AutoAlign or \b #DontAlign.
+ * The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter
+ * controls alignment, which is required for vectorization. It defaults to aligning tensors. Note that tensors currently
+ * do not support any operations that profit from vectorization. Support for such operations (i.e. adding two tensors
+ * etc.) is planned.
+ *
+ * You can access elements of tensors using normal subscripting:
+ *
+ * \code
+ * Eigen::Tensor<double, 4> t(10, 10, 10, 10);
+ * t(0, 1, 2, 3) = 42.0;
+ * \endcode
+ *
+ * This class can be extended with the help of the plugin mechanism described on the page
+ * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN,
+ * \c EIGEN_TENSORBASE_PLUGIN, and \c EIGEN_READONLY_TENSORBASE_PLUGIN.
+ *
+ * Some notes:
+ *
+ * <dl>
+ * <dt><b>Relation to other parts of Eigen:</b></dt>
+ * <dd>The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that
+ * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code
+ * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the
+ * %Tensor class does not provide any of these features and is only available as a stand-alone class that just allows
+ * for coefficient access. Also, when fixed-size tensors are implemented, the number of template arguments is likely to
+ * change dramatically.</dd>
+ * </dl>
+ *
+ * \ref TopicStorageOrders
+ */
+
+template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
+class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > {
+ public:
+  typedef Tensor<Scalar_, NumIndices_, Options_, IndexType_> Self;
+  typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > Base;
+  typedef typename Eigen::internal::nested<Self>::type Nested;
+  typedef typename internal::traits<Self>::StorageKind StorageKind;
+  typedef typename internal::traits<Self>::Index Index;
+  typedef Scalar_ Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef typename Base::CoeffReturnType CoeffReturnType;
+
+  enum { IsAligned = (EIGEN_MAX_ALIGN_BYTES > 0) && !(Options_ & DontAlign), CoordAccess = true, RawAccess = true };
+
+  static constexpr int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
+  static constexpr int Options = Options_;
+  static constexpr int NumIndices = NumIndices_;
+  typedef DSizes<Index, NumIndices_> Dimensions;
+
+ protected:
+  TensorStorage<Scalar, Dimensions, Options> m_storage;
+
+  template <typename CustomIndices>
+  struct isOfNormalIndex {
+    static const bool is_array = internal::is_base_of<array<Index, NumIndices>, CustomIndices>::value;
+    static const bool is_int = NumTraits<CustomIndices>::IsInteger;
+    static const bool value = is_array | is_int;
+  };
+
+ public:
+  // Metadata
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() { return m_storage.data(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar* data() const { return m_storage.data(); }
+
+  // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+  // work, because that uses base().coeffRef() - and we don't yet
+  // implement a similar class hierarchy
+  inline Self& base() { return *this; }
+  inline const Self& base() const { return *this; }
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex,
+                                                            IndexTypes... otherIndices) const {
+    // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+  }
+
+  // normal indices
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const {
+    eigen_internal_assert(checkIndexRange(indices));
+    return m_storage.data()[linearizedIndex(indices)];
+  }
+
+  // custom indices
+  template <typename CustomIndices, EIGEN_SFINAE_ENABLE_IF(!(isOfNormalIndex<CustomIndices>::value))>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const {
+    return coeff(internal::customIndices2Array<Index, NumIndices>(indices));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return m_storage.data()[0];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const {
+    eigen_internal_assert(index >= 0 && index < size());
+    return m_storage.data()[index];
+  }
+
+  template <typename... IndexTypes>
+  inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) {
+    // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+  }
+
+  // normal indices
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) {
+    eigen_internal_assert(checkIndexRange(indices));
+    return m_storage.data()[linearizedIndex(indices)];
+  }
+
+  // custom indices
+  template <typename CustomIndices, EIGEN_SFINAE_ENABLE_IF(!(isOfNormalIndex<CustomIndices>::value))>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices) {
+    return coeffRef(internal::customIndices2Array<Index, NumIndices>(indices));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return m_storage.data()[0];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    eigen_internal_assert(index >= 0 && index < size());
+    return m_storage.data()[index];
+  }
+
+  template <typename... IndexTypes>
+  inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const {
+    // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+  }
+
+  // custom indices
+  template <typename CustomIndices, EIGEN_SFINAE_ENABLE_IF(!(isOfNormalIndex<CustomIndices>::value))>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const {
+    return coeff(internal::customIndices2Array<Index, NumIndices>(indices));
+  }
+
+  // normal indices
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const {
+    return coeff(indices);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const {
+    eigen_internal_assert(index >= 0 && index < size());
+    return coeff(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return coeff();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const {
+    // The bracket operator is only for vectors, use the parenthesis operator instead.
+    EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return coeff(index);
+  }
+
+  template <typename... IndexTypes>
+  inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) {
+    // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+  }
+
+  // normal indices
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) {
+    return coeffRef(indices);
+  }
+
+  // custom indices
+  template <typename CustomIndices, EIGEN_SFINAE_ENABLE_IF(!(isOfNormalIndex<CustomIndices>::value))>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices) {
+    return coeffRef(internal::customIndices2Array<Index, NumIndices>(indices));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) {
+    eigen_assert(index >= 0 && index < size());
+    return coeffRef(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return coeffRef();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) {
+    // The bracket operator is only for vectors, use the parenthesis operator instead
+    EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return coeffRef(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor() : m_storage() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(const Self& other) : Base(other), m_storage(other.m_storage) {}
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions)
+      : m_storage(firstDimension, otherDimensions...) {
+    // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  }
+
+  /** Normal Dimension */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array<Index, NumIndices>& dimensions)
+      : m_storage(internal::array_prod(dimensions), dimensions) {
+    EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+  }
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) {
+    EIGEN_STATIC_ASSERT(OtherDerived::NumDimensions == Base::NumDimensions, Number_of_dimensions_must_match)
+    typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+    Assign assign(*this, other.derived());
+    resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+  }
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other) {
+    EIGEN_STATIC_ASSERT(OtherDerived::NumDimensions == Base::NumDimensions, Number_of_dimensions_must_match)
+    typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+    Assign assign(*this, other.derived());
+    resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Self&& other) : m_storage(std::move(other.m_storage)) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(Self&& other) {
+    m_storage = std::move(other.m_storage);
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) {
+    typedef TensorAssignOp<Tensor, const Tensor> Assign;
+    Assign assign(*this, other);
+    resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+    return *this;
+  }
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) {
+    typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+    Assign assign(*this, other);
+    resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+    return *this;
+  }
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC void resize(Index firstDimension, IndexTypes... otherDimensions) {
+    // The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}});
+  }
+
+  /** Normal Dimension */
+  EIGEN_DEVICE_FUNC void resize(const array<Index, NumIndices>& dimensions) {
+#ifndef EIGEN_NO_DEBUG
+    Index size = Index(1);
+    for (int i = 0; i < NumIndices; i++) {
+      internal::check_rows_cols_for_overflow<Dynamic>::run(size, dimensions[i]);
+      size *= dimensions[i];
+    }
+#else
+    Index size = internal::array_prod(dimensions);
+#endif
+
+#ifdef EIGEN_INITIALIZE_COEFFS
+    bool size_changed = size != this->size();
+    m_storage.resize(size, dimensions);
+    if (size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+#else
+    m_storage.resize(size, dimensions);
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC void resize() {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    // Nothing to do: rank 0 tensors have fixed size
+  }
+
+  template <typename FirstType, typename... OtherTypes>
+  EIGEN_DEVICE_FUNC void resize(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
+    array<Index, NumIndices> dims;
+    for (int i = 0; i < NumIndices; ++i) {
+      dims[i] = static_cast<Index>(dimensions[i]);
+    }
+    resize(dims);
+  }
+
+  /** Custom Dimension */
+  template <typename CustomDimension, EIGEN_SFINAE_ENABLE_IF(!(isOfNormalIndex<CustomDimension>::value))>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions) {
+    resize(internal::customIndices2Array<Index, NumIndices>(dimensions));
+  }
+
+  template <std::ptrdiff_t... Indices>
+  EIGEN_DEVICE_FUNC void resize(const Sizes<Indices...>& dimensions) {
+    array<Index, NumIndices> dims;
+    for (int i = 0; i < NumIndices; ++i) {
+      dims[i] = static_cast<Index>(dimensions[i]);
+    }
+    resize(dims);
+  }
+
+#ifdef EIGEN_TENSOR_PLUGIN
+#include EIGEN_TENSOR_PLUGIN
+#endif
+
+ protected:
+  bool checkIndexRange(const array<Index, NumIndices>& indices) const {
+    using internal::array_apply_and_reduce;
+    using internal::array_zip_and_reduce;
+    using internal::greater_equal_zero_op;
+    using internal::lesser_op;
+    using internal::logical_and_op;
+
+    return
+        // check whether the indices are all >= 0
+        array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) &&
+        // check whether the indices fit in the dimensions
+        array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const {
+    if (Options & RowMajor) {
+      return m_storage.dimensions().IndexOfRowMajor(indices);
+    } else {
+      return m_storage.dimensions().IndexOfColMajor(indices);
+    }
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_H
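An illustration of the resize/coefficient-access API defined in Tensor.h above; a hypothetical usage sketch, not part of the patch:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> m;  // rank fixed at compile time, sizes dynamic
  m.resize(3, 4);             // variadic resize must pass exactly rank arguments
  m.setZero();
  m(2, 3) = 1.5f;             // operator() statically checks the index count
  return m.size() == 12 ? 0 : 1;
}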
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
new file mode 100644
index 00000000000..f272354c4c5
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -0,0 +1,284 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo
+//                    Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
+#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+/** \class TensorIndexPair
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor + Index Pair class.
+ *
+ */
+template <typename XprType>
+struct traits<TensorIndexPairOp<XprType>> : public traits<XprType> {
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef Pair<Index, typename XprTraits::Scalar> Scalar;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+};
+
+template <typename XprType>
+struct eval<TensorIndexPairOp<XprType>, Eigen::Dense> {
+  typedef const TensorIndexPairOp<XprType> EIGEN_DEVICE_REF type;
+};
+
+template <typename XprType>
+struct nested<TensorIndexPairOp<XprType>, 1, typename eval<TensorIndexPairOp<XprType>>::type> {
+  typedef TensorIndexPairOp<XprType> type;
+};
+
+}  // end namespace internal
+
+template <typename XprType>
+class TensorIndexPairOp : public TensorBase<TensorIndexPairOp<XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorIndexPairOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename Eigen::internal::nested<TensorIndexPairOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorIndexPairOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorIndexPairOp>::Index Index;
+  typedef Pair<Index, typename XprType::CoeffReturnType> CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexPairOp(const XprType& expr) : m_xpr(expr) {}
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+};
+
+// Eval as rvalue
+template <typename ArgType, typename Device>
+struct TensorEvaluator<const TensorIndexPairOp<ArgType>, Device> {
+  typedef TensorIndexPairOp<ArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  static constexpr int NumDims = internal::array_size<Dimensions>::value;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return CoeffReturnType(index, m_impl.coeff(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ protected:
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+namespace internal {
+
+/** \class TensorPairIndex
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Converts to Tensor<Pair<Index, Scalar>> and reduces to Tensor<Index>.
+ *
+ */
+template <typename ReduceOp, typename Dims, typename XprType>
+struct traits<TensorPairReducerOp<ReduceOp, Dims, XprType>> : public traits<XprType> {
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef Index Scalar;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
+  static constexpr int Layout = XprTraits::Layout;
+};
+
+template <typename ReduceOp, typename Dims, typename XprType>
+struct eval<TensorPairReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense> {
+  typedef const TensorPairReducerOp<ReduceOp, Dims, XprType> EIGEN_DEVICE_REF type;
+};
+
+template <typename ReduceOp, typename Dims, typename XprType>
+struct nested<TensorPairReducerOp<ReduceOp, Dims, XprType>, 1,
+              typename eval<TensorPairReducerOp<ReduceOp, Dims, XprType>>::type> {
+  typedef TensorPairReducerOp<ReduceOp, Dims, XprType> type;
+};
+
+}  // end namespace internal
+
+template <typename ReduceOp, typename Dims, typename XprType>
+class TensorPairReducerOp : public TensorBase<TensorPairReducerOp<ReduceOp, Dims, XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorPairReducerOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename Eigen::internal::nested<TensorPairReducerOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorPairReducerOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorPairReducerOp>::Index Index;
+  typedef Index CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPairReducerOp(const XprType& expr, const ReduceOp& reduce_op,
+                                                            const Index return_dim, const Dims& reduce_dims)
+      : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {}
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+  EIGEN_DEVICE_FUNC const ReduceOp& reduce_op() const { return m_reduce_op; }
+
+  EIGEN_DEVICE_FUNC const Dims& reduce_dims() const { return m_reduce_dims; }
+
+  EIGEN_DEVICE_FUNC Index return_dim() const { return m_return_dim; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const ReduceOp m_reduce_op;
+  const Index m_return_dim;
+  const Dims m_reduce_dims;
+};
+
+// Eval as rvalue
+template <typename ReduceOp, typename Dims, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorPairReducerOp<ReduceOp, Dims, ArgType>, Device> {
+  typedef TensorPairReducerOp<ReduceOp, Dims, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename TensorIndexPairOp<ArgType>::CoeffReturnType PairType;
+  typedef typename TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexPairOp<ArgType>>,
+                                   Device>::Dimensions Dimensions;
+  typedef typename TensorEvaluator<const TensorIndexPairOp<ArgType>, Device>::Dimensions InputDimensions;
+  static constexpr int NumDims = internal::array_size<InputDimensions>::value;
+  typedef array<Index, NumDims> StrideDims;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<PairType, Device> PairStorageMem;
+
+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+  static constexpr int Layout =
+      TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexPairOp<ArgType>>, Device>::Layout;
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_orig_impl(op.expression(), device),
+        m_impl(op.expression().index_pairs().reduce(op.reduce_dims(), op.reduce_op()), device),
+        m_return_dim(op.return_dim()) {
+    gen_strides(m_orig_impl.dimensions(), m_strides);
+    if (Layout == static_cast<int>(ColMajor)) {
+      const Index total_size = internal::array_prod(m_orig_impl.dimensions());
+      m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size;
+    } else {
+      const Index total_size = internal::array_prod(m_orig_impl.dimensions());
+      m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size;
+    }
+    // If m_return_dim is not a valid index, returns 1 or this can crash on Windows.
+    m_stride_div =
+        ((m_return_dim >= 0) && (m_return_dim < static_cast<Index>(m_strides.size()))) ? m_strides[m_return_dim] : 1;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    const PairType v = m_impl.coeff(index);
+    return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double compute_cost =
+        1.0 + (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost<Index>() + TensorOpCost::DivCost<Index>()));
+    return m_orig_impl.costPerCoeff(vectorized) + m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost);
+  }
+
+ private:
+  EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) {
+    if (m_return_dim < 0) {
+      return;  // Won't be using the strides.
+    }
+    eigen_assert(m_return_dim < NumDims && "Asking to convert index to a dimension outside of the rank");
+
+    // Calculate m_stride_div and m_stride_mod, which are used to
+    // calculate the value of an index w.r.t. the m_return_dim.
+    if (Layout == static_cast<int>(ColMajor)) {
+      strides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        strides[i] = strides[i - 1] * dims[i - 1];
+      }
+    } else {
+      strides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        strides[i] = strides[i + 1] * dims[i + 1];
+      }
+    }
+  }
+
+ protected:
+  TensorEvaluator<const TensorIndexPairOp<ArgType>, Device> m_orig_impl;
+  TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexPairOp<ArgType>>, Device> m_impl;
+  const Index m_return_dim;
+  StrideDims m_strides;
+  Index m_stride_mod;
+  Index m_stride_div;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
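The pair/reducer machinery above is what backs TensorBase::argmax() and argmin(); a hypothetical sketch of the user-visible behavior, not part of the patch:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> t(2, 3);
  t.setValues({{1.f, 7.f, 2.f}, {4.f, 0.f, 9.f}});
  // Reduce along dimension 1: for each row, the index of its maximum.
  Eigen::Tensor<Eigen::Index, 1> am = t.argmax(1);
  return (am(0) == 1 && am(1) == 2) ? 0 : 1;
}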
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
new file mode 100644
index 00000000000..8a57576b7ba
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -0,0 +1,213 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
+#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class TensorAssign
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor assignment class.
+ *
+ * This class represents the assignment of the values resulting from the evaluation of
+ * the rhs expression to the memory locations denoted by the lhs expression.
+ */
+namespace internal {
+template <typename LhsXprType, typename RhsXprType>
+struct traits<TensorAssignOp<LhsXprType, RhsXprType> > {
+  typedef typename LhsXprType::Scalar Scalar;
+  typedef typename traits<LhsXprType>::StorageKind StorageKind;
+  typedef
+      typename promote_index_type<typename traits<LhsXprType>::Index, typename traits<RhsXprType>::Index>::type Index;
+  typedef typename LhsXprType::Nested LhsNested;
+  typedef typename RhsXprType::Nested RhsNested;
+  typedef std::remove_reference_t<LhsNested> LhsNested_;
+  typedef std::remove_reference_t<RhsNested> RhsNested_;
+  static constexpr std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions;
+  static constexpr int Layout = internal::traits<LhsXprType>::Layout;
+  typedef typename traits<LhsXprType>::PointerType PointerType;
+
+  enum { Flags = 0 };
+};
+
+template <typename LhsXprType, typename RhsXprType>
+struct eval<TensorAssignOp<LhsXprType, RhsXprType>, Eigen::Dense> {
+  typedef const TensorAssignOp<LhsXprType, RhsXprType>& type;
+};
+
+template <typename LhsXprType, typename RhsXprType>
+struct nested<TensorAssignOp<LhsXprType, RhsXprType>, 1, typename eval<TensorAssignOp<LhsXprType, RhsXprType> >::type> {
+  typedef TensorAssignOp<LhsXprType, RhsXprType> type;
+};
+
+}  // end namespace internal
+
+template <typename LhsXprType, typename RhsXprType>
+class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> > {
+ public:
+  typedef typename Eigen::internal::traits<TensorAssignOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename LhsXprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorAssignOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
+
+  static constexpr int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
+      : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
+
+  /** \returns the nested expressions */
+  EIGEN_DEVICE_FUNC internal::remove_all_t<typename LhsXprType::Nested>& lhsExpression() const {
+    return *((internal::remove_all_t<typename LhsXprType::Nested>*)&m_lhs_xpr);
+  }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename RhsXprType::Nested>& rhsExpression() const {
+    return m_rhs_xpr;
+  }
+
+ protected:
+  internal::remove_all_t<typename LhsXprType::Nested>& m_lhs_xpr;
+  const internal::remove_all_t<typename RhsXprType::Nested>& m_rhs_xpr;
+};
+
+template <typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> {
+  typedef TensorAssignOp<LeftArgType, RightArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static constexpr int NumDims = XprType::NumDims;
+  static constexpr int Layout = TensorEvaluator<LeftArgType, Device>::Layout;
+
+  enum {
+    IsAligned =
+        int(TensorEvaluator<LeftArgType, Device>::IsAligned) & int(TensorEvaluator<RightArgType, Device>::IsAligned),
+    PacketAccess = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) &
+                   int(TensorEvaluator<RightArgType, Device>::PacketAccess),
+    BlockAccess = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) &
+                  int(TensorEvaluator<RightArgType, Device>::BlockAccess),
+    PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) |
+                        int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess),
+    RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock RightTensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType& op, const Device& device)
+      : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
+                         static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+  }
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const {
+    // The dimensions of the lhs and the rhs tensors should be equal to prevent
+    // overflows and ensure the result is fully initialized.
+    // TODO: use left impl instead if right impl dimensions are known at compile time.
+    return m_rightImpl.dimensions();
+  }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+    eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
+    m_leftImpl.evalSubExprsIfNeeded(NULL);
+    // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
+    // null value), attempt to evaluate the rhs expression in place. Returns true iff in place
+    // evaluation isn't supported and the caller still needs to manually assign the values generated
+    // by the rhs to the lhs.
+    return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data());
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
+      m_rightImpl.evalSubExprsIfNeededAsync(m_leftImpl.data(), [done](bool need_assign) { done(need_assign); });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_leftImpl.cleanup();
+    m_rightImpl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const {
+    m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const {
+    const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
+    const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
+    m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
+  }
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_leftImpl.coeff(index); }
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+    return m_leftImpl.template packet<LoadMode>(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // We assume that evalPacket or evalScalar is called to perform the
+    // assignment and account for the cost of the write here, but reduce left
+    // cost by one load because we are using m_leftImpl.coeffRef.
+    TensorOpCost left = m_leftImpl.costPerCoeff(vectorized);
+    return m_rightImpl.costPerCoeff(vectorized) +
+           TensorOpCost(numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)), left.bytes_stored(),
+                        left.compute_cycles()) +
+           TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::merge(m_leftImpl.getResourceRequirements(),
+                                                            m_rightImpl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    if (TensorEvaluator<LeftArgType, Device>::RawAccess && m_leftImpl.data() != NULL) {
+      // If destination has raw data access, we pass it as a potential
+      // destination for a block descriptor evaluation.
+      desc.template AddDestinationBuffer<Layout>(
+          /*dst_base=*/m_leftImpl.data() + desc.offset(),
+          /*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()));
+    }
+
+    RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true);
+    // If block was evaluated into a destination, there is no need to do assignment.
+ if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { + m_leftImpl.writeBlock(desc, block); + } + block.cleanup(); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); } + + private: + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h new file mode 100644 index 00000000000..fc3f3b78a84 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -0,0 +1,1244 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H +#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H + +// clang-format off + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorBase + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor base class. + * + * This class is the common parent of the Tensor and TensorMap class, thus + * making it possible to use either class interchangeably in expressions. + */ +#ifndef EIGEN_PARSED_BY_DOXYGEN +// FIXME Doxygen does not like the inheritance with different template parameters +// Since there is no doxygen documentation inside, we disable it for now +template +class TensorBase +{ + public: + typedef internal::traits DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef std::remove_const_t CoeffReturnType; + static constexpr int NumDimensions = DerivedTraits::NumDimensions; + + // Generic nullary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp + nullaryExpr(const CustomNullaryOp& func) const { + return TensorCwiseNullaryOp(derived(), func); + } + + // Coefficient-wise nullary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + constant(const Scalar& value) const { + return nullaryExpr(internal::scalar_constant_op(value)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + random() const { + return nullaryExpr(internal::UniformRandomGenerator()); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp + random(const RandomGenerator& gen = RandomGenerator()) const { + return nullaryExpr(gen); + } + + // Tensor generation + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorGeneratorOp + generate(const Generator& generator) const { + return TensorGeneratorOp(derived(), generator); + } + + // Generic unary operation support. 
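+ // Editor's note: a minimal usage sketch for the nullary and generic unary
+ // hooks in this section (names like `t` are illustrative, not part of this
+ // header):
+ //
+ //   Eigen::Tensor<float, 2> t(3, 4);
+ //   t.setRandom();
+ //   auto ones = t.constant(1.0f);                  // nullary expression
+ //   auto doubled = t.unaryExpr([](float x) { return 2.0f * x; });
+ //   Eigen::Tensor<float, 2> r = ones + doubled;    // forces evaluation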
+ template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp + unaryExpr(const CustomUnaryOp& func) const { + return TensorCwiseUnaryOp(derived(), func); + } + + // Coefficient-wise unary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator-() const { + return unaryExpr(internal::scalar_opposite_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sqrt() const { + return unaryExpr(internal::scalar_sqrt_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sign() const { + return unaryExpr(internal::scalar_sign_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + rsqrt() const { + return unaryExpr(internal::scalar_rsqrt_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + square() const { + return unaryExpr(internal::scalar_square_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + cube() const { + return unaryExpr(internal::scalar_cube_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + inverse() const { + return unaryExpr(internal::scalar_inverse_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + tanh() const { + return unaryExpr(internal::scalar_tanh_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + lgamma() const { + return unaryExpr(internal::scalar_lgamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + digamma() const { + return unaryExpr(internal::scalar_digamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i0() const { + return unaryExpr(internal::scalar_bessel_i0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i0e() const { + return unaryExpr(internal::scalar_bessel_i0e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i1() const { + return unaryExpr(internal::scalar_bessel_i1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i1e() const { + return unaryExpr(internal::scalar_bessel_i1e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_j0() const { + return unaryExpr(internal::scalar_bessel_j0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_y0() const { + return unaryExpr(internal::scalar_bessel_y0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_j1() const { + return unaryExpr(internal::scalar_bessel_j1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_y1() const { + return unaryExpr(internal::scalar_bessel_y1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k0() const { + return unaryExpr(internal::scalar_bessel_k0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k0e() const { + return unaryExpr(internal::scalar_bessel_k0e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k1() const { + return 
unaryExpr(internal::scalar_bessel_k1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k1e() const { + return unaryExpr(internal::scalar_bessel_k1e_op()); + } + + // igamma(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igamma(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igamma_op()); + } + + // igamma_der_a(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igamma_der_a(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igamma_der_a_op()); + } + + // gamma_sample_der_alpha(alpha = this, sample = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + gamma_sample_der_alpha(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_gamma_sample_der_alpha_op()); + } + + // igammac(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igammac(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igammac_op()); + } + + // zeta(x = this, q = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + zeta(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_zeta_op()); + } + + // polygamma(n = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + polygamma(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_polygamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + erf() const { + return unaryExpr(internal::scalar_erf_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + erfc() const { + return unaryExpr(internal::scalar_erfc_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + ndtri() const { + return unaryExpr(internal::scalar_ndtri_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sigmoid() const { + return unaryExpr(internal::scalar_logistic_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + exp() const { + return unaryExpr(internal::scalar_exp_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + expm1() const { + return unaryExpr(internal::scalar_expm1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log() const { + return unaryExpr(internal::scalar_log_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log1p() const { + return unaryExpr(internal::scalar_log1p_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log2() const { + return unaryExpr(internal::scalar_log2_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + abs() const { + return unaryExpr(internal::scalar_abs_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + arg() const { + return 
unaryExpr(internal::scalar_arg_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + clip(Scalar min, Scalar max) const { + return unaryExpr(internal::scalar_clamp_op(min, max)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const std::conditional_t::IsComplex, + TensorCwiseUnaryOp, const Derived>, + Derived> + conjugate() const { + return choose(Cond::IsComplex>(), unaryExpr(internal::scalar_conjugate_op()), derived()); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::enable_if_t::Real>::value, + TensorCwiseUnaryOp, const Derived>> + pow(ScalarExponent exponent) const + { + return unaryExpr(internal::scalar_unary_pow_op(exponent)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + real() const { + return unaryExpr(internal::scalar_real_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + imag() const { + return unaryExpr(internal::scalar_imag_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator+ (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator+ (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator- (Scalar rhs) const { + EIGEN_STATIC_ASSERT((NumTraits::IsSigned || internal::is_same >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator- (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator* (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator* (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator/ (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator/ (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator% (Scalar rhs) const { + EIGEN_STATIC_ASSERT(NumTraits::IsInteger, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD); + return unaryExpr(internal::scalar_mod_op(rhs)); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + cwiseMax(Scalar threshold) const { + return cwiseMax(constant(threshold)); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + cwiseMin(Scalar threshold) const { + return cwiseMin(constant(threshold)); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const std::conditional_t::value, + Derived, + TensorConversionOp > + cast() const { + return choose(Cond::value>(), derived(), TensorConversionOp(derived())); + } + + 
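+ // Editor's note: a hedged sketch of the scalar-argument operators and cast()
+ // above (tensor `a` is illustrative):
+ //
+ //   Eigen::Tensor<int, 1> a(5);
+ //   a.setConstant(7);
+ //   auto shifted = a + 1;             // scalar sum via bind2nd_op
+ //   auto clipped = a.clip(0, 5);      // clamp coefficients to [0, 5]
+ //   auto as_float = a.cast<float>();  // TensorConversionOp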
EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + round() const { + return unaryExpr(internal::scalar_round_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + rint() const { + return unaryExpr(internal::scalar_rint_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + ceil() const { + return unaryExpr(internal::scalar_ceil_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + floor() const { + return unaryExpr(internal::scalar_floor_op()); + } + + // Generic binary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp + binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const { + return TensorCwiseBinaryOp(derived(), other, func); + } + + // Coefficient-wise binary operators. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator+(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_sum_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator-(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_difference_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator*(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_product_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator/(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_quotient_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + cwiseMax(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_max_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + cwiseMin(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_min_op()); + } + + // logical operators + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator&&(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_and_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator||(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator&(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_bitwise_and_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator|(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_bitwise_or_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator^(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_bitwise_xor_op()); + } + + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE + const TensorCwiseUnaryOp, const Derived> + operator!() const { + return unaryExpr(internal::scalar_boolean_not_op()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseUnaryOp, const Derived> + operator~() const { + return unaryExpr(internal::scalar_bitwise_not_op()); + } + + // Comparisons and tests. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator<(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator<=(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator>(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator>=(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator==(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator!=(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + // comparisons and tests for Scalars + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator<(Scalar threshold) const { + return operator<(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator<=(Scalar threshold) const { + return operator<=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator>(Scalar threshold) const { + return operator>(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator>=(Scalar threshold) const { + return operator>=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator==(Scalar threshold) const { + return operator==(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator!=(Scalar threshold) const { + return operator!=(constant(threshold)); + } + + // Predicates. 
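+ // Editor's note: the comparisons above yield boolean tensor expressions that
+ // are typically consumed by select() (defined further below); a sketch with
+ // illustrative names:
+ //
+ //   Eigen::Tensor<float, 2> x(2, 2), zeros(2, 2);
+ //   x.setRandom();
+ //   zeros.setZero();
+ //   Eigen::Tensor<float, 2> relu = (x > 0.0f).select(x, zeros);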
+ EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorConversionOp, const Derived>> + (isnan)() const { + return unaryExpr(internal::scalar_isnan_op()).template cast(); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorConversionOp, const Derived>> + (isinf)() const { + return unaryExpr(internal::scalar_isinf_op()).template cast(); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorConversionOp, const Derived>> + (isfinite)() const { + return unaryExpr(internal::scalar_isfinite_op()).template cast(); + } + + // Coefficient-wise ternary operators. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSelectOp + select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { + return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); + } + + // Contractions. + typedef Eigen::IndexPair DimensionPair; + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp + contract(const OtherDerived& other, const Dimensions& dims) const { + return TensorContractionOp(derived(), other.derived(), dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp + contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& output_kernel) const { + return TensorContractionOp(derived(), other.derived(), dims, output_kernel); + } + + // Convolutions. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConvolutionOp + convolve(const KernelDerived& kernel, const Dimensions& dims) const { + return TensorConvolutionOp(derived(), kernel.derived(), dims); + } + + // Fourier transforms + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorFFTOp + fft(const FFT& dims) const { + return TensorFFTOp(derived(), dims); + } + + // Scan. + typedef TensorScanOp, const Derived> TensorScanSumOp; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanSumOp + cumsum(const Index& axis, bool exclusive = false) const { + return TensorScanSumOp(derived(), axis, exclusive); + } + + typedef TensorScanOp, const Derived> TensorScanProdOp; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanProdOp + cumprod(const Index& axis, bool exclusive = false) const { + return TensorScanProdOp(derived(), axis, exclusive); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanOp + scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const { + return TensorScanOp(derived(), axis, exclusive, reducer); + } + + // Reductions. 
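+ // Editor's note: a reduction sketch, assuming a rank-3 float tensor (the
+ // reduced dimension index is illustrative):
+ //
+ //   Eigen::Tensor<float, 3> t(4, 3, 2);
+ //   t.setRandom();
+ //   Eigen::array<Eigen::Index, 1> dims = {1};  // reduce the middle dimension
+ //   Eigen::Tensor<float, 2> s = t.sum(dims);   // result is 4x2
+ //   Eigen::Tensor<float, 0> total = t.sum();   // reduce over all dimensions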
+ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + sum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::SumReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + sum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::SumReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + mean(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MeanReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + mean() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MeanReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + prod(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::ProdReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + prod() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::ProdReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + maximum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); + } + + template + const TensorReductionOp, const DimensionList, const Derived> + maximum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MaxReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + minimum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); + } + + template + const TensorReductionOp, const DimensionList, const Derived> + minimum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MinReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp::value, Derived, TensorConversionOp > > + all(const Dims& dims) const { + return cast().reduce(dims, internal::AndReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const std::conditional_t::value, Derived, TensorConversionOp > > + all() const { + DimensionList in_dims; + return cast().reduce(in_dims, internal::AndReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp::value, Derived, TensorConversionOp > > + any(const Dims& dims) const { + return cast().reduce(dims, internal::OrReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const std::conditional_t::value, Derived, TensorConversionOp > > + any() const { + DimensionList in_dims; + return cast().reduce(in_dims, internal::OrReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPairReducerOp< + internal::ArgMaxPairReducer >, + const array, const Derived> + argmax() const { + array in_dims; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; + return TensorPairReducerOp< + internal::ArgMaxPairReducer >, + const array, + const Derived>(derived(), 
internal::ArgMaxPairReducer >(), -1, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPairReducerOp< + internal::ArgMinPairReducer >, + const array, const Derived> + argmin() const { + array in_dims; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; + return TensorPairReducerOp< + internal::ArgMinPairReducer >, + const array, + const Derived>(derived(), internal::ArgMinPairReducer >(), -1, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPairReducerOp< + internal::ArgMaxPairReducer >, + const array, const Derived> + argmax(const Index return_dim) const { + array in_dims; + in_dims[0] = return_dim; + return TensorPairReducerOp< + internal::ArgMaxPairReducer >, + const array, + const Derived>(derived(), internal::ArgMaxPairReducer >(), return_dim, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPairReducerOp< + internal::ArgMinPairReducer >, + const array, const Derived> + argmin(const Index return_dim) const { + array in_dims; + in_dims[0] = return_dim; + return TensorPairReducerOp< + internal::ArgMinPairReducer >, + const array, + const Derived>(derived(), internal::ArgMinPairReducer >(), return_dim, in_dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp + reduce(const Dims& dims, const Reducer& reducer) const { + return TensorReductionOp(derived(), dims, reducer); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTraceOp + trace(const Dims& dims) const { + return TensorTraceOp(derived(), dims); + } + + const TensorTraceOp, const Derived> + trace() const { + DimensionList in_dims; + return TensorTraceOp, const Derived>(derived(), in_dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorBroadcastingOp + broadcast(const Broadcast& bcast) const { + return TensorBroadcastingOp(derived(), bcast); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp + concatenate(const OtherDerived& other, Axis axis) const { + return TensorConcatenationOp(derived(), other.derived(), axis); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPatchOp + extract_patches(const PatchDims& patch_dims) const { + return TensorPatchOp(derived(), patch_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index patch_rows = 1, const Index patch_cols = 1, + const Index row_stride = 1, const Index col_stride = 1, + const Index in_row_stride = 1, const Index in_col_stride = 1, + const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, 1, 1, padding_type, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride, + const Index row_inflate_stride, const Index col_inflate_stride, + const Index padding_top, const Index padding_bottom, + const Index padding_left,const Index padding_right, + const Scalar padding_value) const { + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride, + padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const 
TensorVolumePatchOp + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1, + const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { + return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value); + } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorVolumePatchOp + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride, const Index row_stride, const Index col_stride, + const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride, + const Index padding_top_z, const Index padding_bottom_z, + const Index padding_top, const Index padding_bottom, + const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const { + return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + + // Morphing operators. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp + swap_layout() const { + return TensorLayoutSwapOp(derived()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp(derived(), newDimensions); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset) const { + EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range) + return TensorChippingOp(derived(), offset, DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorRollOp + roll(const Rolls& rolls) const { + return TensorRollOp(derived(), rolls); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp + pad(const PaddingDimensions& padding) const { + return TensorPaddingOp(derived(), padding, internal::scalar_cast_op()(0)); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp + pad(const PaddingDimensions& padding, const Scalar padding_value) const { + return TensorPaddingOp(derived(), padding, padding_value); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp + shuffle(const Shuffle& shfl) const { + return TensorShufflingOp(derived(), shfl); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const 
TensorStridingOp + stride(const Strides& strides) const { + return TensorStridingOp(derived(), strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorInflationOp + inflate(const Strides& strides) const { + return TensorInflationOp(derived(), strides); + } + + // Returns a tensor containing index/value pairs + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorIndexPairOp + index_pairs() const { + return TensorIndexPairOp(derived()); + } + + // Support for custom unary and binary operations + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomUnaryOp customOp(const CustomUnaryFunc& op) const { + return TensorCustomUnaryOp(derived(), op); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomBinaryOp customOp(const OtherDerived& other, const CustomBinaryFunc& op) const { + return TensorCustomBinaryOp(derived(), other, op); + } + + // Force the evaluation of the expression. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorForcedEvalOp eval() const { + return TensorForcedEvalOp(derived()); + } + + // Returns a formatted tensor ready for printing to a stream + template + inline const TensorWithFormat format(const Format& fmt) const { + return TensorWithFormat(derived(), fmt); + } + + #ifdef EIGEN_READONLY_TENSORBASE_PLUGIN + #include EIGEN_READONLY_TENSORBASE_PLUGIN + #endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } + + protected: + template friend class Tensor; + template friend class TensorFixedSize; + // the Eigen:: prefix is required to work around a compilation issue with nvcc 9.0 + template friend class Eigen::TensorBase; +}; + +template::value> +class TensorBase : public TensorBase { + public: + typedef TensorBase Base; + typedef internal::traits DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef Scalar CoeffReturnType; + static constexpr int NumDimensions = DerivedTraits::NumDimensions; + + template friend class Tensor; + template friend class TensorFixedSize; + // the Eigen:: prefix is required to work around a compilation issue with nvcc 9.0 + template friend class Eigen::TensorBase; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setZero() { + return setConstant(Scalar(0)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { + return derived() = this->constant(val); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->random(); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->template random(); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setValues( + const typename internal::Initializer::InitList& vals) { + TensorEvaluator eval(derived(), DefaultDevice()); + internal::initialize_tensor(eval, vals); + return derived(); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const OtherDerived& other) { + return derived() = derived() + other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const OtherDerived& other) { + return derived() = derived() - other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator*=(const OtherDerived& other) { + return derived() = derived() * other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator/=(const OtherDerived& other) { + return derived() = derived() / other.derived(); + } + 
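+ // Editor's note: a sketch of the writable API above (in-place initialization
+ // and compound assignment; names are illustrative):
+ //
+ //   Eigen::Tensor<float, 2> a(2, 3), b(2, 3);
+ //   a.setValues({{1, 2, 3}, {4, 5, 6}});
+ //   b.setConstant(2.0f);
+ //   a += b;   // evaluates a = a + b through TensorAssignOp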
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp + swap_layout() const { + return TensorLayoutSwapOp(derived()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorLayoutSwapOp + swap_layout() { + return TensorLayoutSwapOp(derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp + concatenate(const OtherDerived& other, const Axis& axis) const { + return TensorConcatenationOp(derived(), other, axis); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorConcatenationOp + concatenate(const OtherDerived& other, const Axis& axis) { + return TensorConcatenationOp(derived(), other, axis); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp(derived(), newDimensions); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReshapingOp + reshape(const NewDimensions& newDimensions) { + return TensorReshapingOp(derived(), newDimensions); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) { + return TensorSlicingOp(derived(), startIndices, sizes); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset) const { + EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range) + return TensorChippingOp(derived(), offset, DimId); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset) { + EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range) + return TensorChippingOp(derived(), offset, DimId); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset, const Index dim) { + return TensorChippingOp(derived(), offset, dim); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReverseOp + reverse(const ReverseDimensions& rev) { + return TensorReverseOp(derived(), rev); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorRollOp + roll(const Rolls& roll) const { + return TensorRollOp(derived(), roll); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorRollOp + roll(const Rolls& roll) { + return TensorRollOp(derived(), roll); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp + shuffle(const Shuffle& shfl) const { + return 
TensorShufflingOp(derived(), shfl); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorShufflingOp + shuffle(const Shuffle& shfl) { + return TensorShufflingOp(derived(), shfl); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp + stride(const Strides& strides) const { + return TensorStridingOp(derived(), strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingOp + stride(const Strides& strides) { + return TensorStridingOp(derived(), strides); + } + + // Select the device on which to evaluate the expression. + template + TensorDevice device(const DeviceType& dev) { + return TensorDevice(dev, derived()); + } + + // Select the async device on which to evaluate the expression. + template + TensorAsyncDevice device(const DeviceType& dev, DoneCallback done) { + return TensorAsyncDevice(dev, derived(), std::move(done)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } + + #ifdef EIGEN_TENSORBASE_PLUGIN + #include EIGEN_TENSORBASE_PLUGIN + #endif + + protected: + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TensorBase) + EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorBase) + + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(derived(), other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + return derived(); + } +}; +#endif // EIGEN_PARSED_BY_DOXYGEN +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h new file mode 100644 index 00000000000..0b068a7c99e --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -0,0 +1,1474 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H +#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +// -------------------------------------------------------------------------- // +// Forward declarations for templates defined below. +template +class TensorBlockIO; + +// -------------------------------------------------------------------------- // +// Helper function to compute strides for densely stored buffer of given +// dimensions. + +// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use +// this function instead everywhere. +template +EIGEN_ALWAYS_INLINE DSizes strides(const DSizes& dimensions) { + DSizes strides; + if (NumDims == 0) return strides; + + // TODO(ezhulenev): Use templates to unroll this loop (similar to + // h_array_reduce in CXX11meta.h)? Benchmark it. 
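+ // Editor's note: for a col-major buffer with dimensions {2, 3, 4} the
+ // col-major branch below yields strides {1, 2, 6} (innermost dimension
+ // contiguous); the row-major branch yields {12, 4, 1}.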
+ if (static_cast(Layout) == static_cast(ColMajor)) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i - 1] * dimensions[i - 1]; + } + } else { + strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dimensions[i + 1]; + } + } + + return strides; +} + +template +EIGEN_ALWAYS_INLINE DSizes strides(const Eigen::array& dimensions) { + return strides(DSizes(dimensions)); +} + +template +EIGEN_STRONG_INLINE DSizes strides(const Sizes& sizes) { + return strides(DSizes(sizes)); +} + +// -------------------------------------------------------------------------- // + +// Tensor block shape type defines the preferred shape for the blocks +// extracted from the larger tensor. +// +// Example: blocks of 100 elements from the large 100x100 tensor: +// - tensor: 100x100 +// - target_block_size: 100 +// +// TensorBlockShapeType: +// - kUniformAllDims: 100 blocks of size 10x10 +// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column +// or row major layout) +enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims }; + +struct TensorBlockResourceRequirements { + TensorBlockShapeType shape_type; // target block shape + size_t size; // target block size + TensorOpCost cost_per_coeff; // cost of computing a single block element + +#ifdef EIGEN_HIPCC + // For HIPCC, we need to explicitly declare the constructor, which is + // implicitly invoked in the "merge" / "any" routines, as a "device function"; + // otherwise HIPCC errors out complaining about the lack of a matching constructor + EIGEN_DEVICE_FUNC TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_, TensorOpCost cost_) + : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) {} +#endif + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type, + size_t size_in_bytes, TensorOpCost cost) { + const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar)); + return {shape_type, size, cost}; + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type, + size_t size_in_bytes) { + // This default cost per coefficient is valid for most materialized tensor + // block evaluation implementations, because they typically just read + // coefficients from the underlying tensor storage, and write to the tensor + // block buffer (scratch or destination memory; reads and writes have a linear + // access pattern). We ignore the fixed cost of block evaluation, because in + // practice it should be negligible. + // + // Lazy block evaluation adds the cost of calling a functor for each + // coefficient. + // + // All non-trivial block evaluation implementations must provide their own + // cost approximation (e.g. shuffling inner dimension has a much higher cost + // because it reads memory randomly, although the total number of moved + // bytes is the same). 
+ return withShapeAndSize(shape_type, size_in_bytes, + {/*bytes_loaded=*/sizeof(Scalar), + /*bytes_stored=*/sizeof(Scalar), + /*compute_cycles=*/0}); + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(size_t size_in_bytes) { + return withShapeAndSize(TensorBlockShapeType::kSkewedInnerDims, size_in_bytes); + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(size_t size_in_bytes) { + return withShapeAndSize(TensorBlockShapeType::kUniformAllDims, size_in_bytes); + } + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements + merge(const TensorBlockResourceRequirements& lhs, const TensorBlockResourceRequirements& rhs) { + return {merge(lhs.shape_type, rhs.shape_type), // shape_type + merge(lhs.size, rhs.size), // size + merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff + } + + EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(TensorOpCost cost) { + cost_per_coeff += cost; + return *this; + } + + // This is a resource requirement that should be returned from expressions + // that do not have any block evaluation preference (e.g. default tensor + // expression with raw buffer access). + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() { + return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}}; + } + + private: + using Requirements = TensorBlockResourceRequirements; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) { + return numext::maxi(lhs_size, rhs_size); + } + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockShapeType merge(TensorBlockShapeType lhs, + TensorBlockShapeType rhs) { + return (lhs == TensorBlockShapeType::kSkewedInnerDims || rhs == TensorBlockShapeType::kSkewedInnerDims) + ? TensorBlockShapeType::kSkewedInnerDims + : TensorBlockShapeType::kUniformAllDims; + } + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, TensorOpCost rhs_cost) { + return lhs_cost + rhs_cost; + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockDescriptor specifies a block offset within a tensor and the block +// sizes along each of the tensor dimensions. + +template +class TensorBlockDescriptor { + public: + typedef DSizes Dimensions; + + // If we evaluate a Tensor assignment and the expression on the left already + // has a memory buffer, then we can apply a performance optimization and + // evaluate the root expression directly into the final output memory. + // Sometimes it's possible to reuse that buffer for materializing + // subexpressions inside an expression tree, to avoid dynamic memory + // allocation. + // + // The pointer type of the underlying storage is erased, because passing the + // Scalar type through all the expression evaluation layers would require way + // too many templates. In practice the destination buffer type should always + // match the evaluated expression's scalar type. + class DestinationBuffer { + public: + enum DestinationBufferKind : int { + // The above explicit specification of "int" as the enum basetype is + // needed to get around a HIPCC link error ("the field type is not + // amp-compatible") + // which is issued for class members with the enum type. + // TODO(rocm): + // remove the "int" basetype once HIPCC has been fixed to not error out + // in the above scenario. + + // Destination buffer is not defined (`m_data` == nullptr). 
+ kEmpty, + + // Tensor block defined by an owning tensor block descriptor can fit + // contiguously into the destination buffer. In this case it's safe to + // materialize tensor block in the destination buffer, wrap it in a + // TensorMap, and use to build Eigen expression on top of it. + kContiguous, + + // Destination buffer strides do not match strides of the contiguously + // stored block, and it's impossible to define a TensorMap over this + // buffer. However if we are evaluating a root of an expression tree, we + // still can materialize an output into this destination, because we can + // guarantee that no one will ever access it through block API. + // + // In theory it is possible to build valid TensorStriding + // expression on top of this destination buffer, however it has + // inefficient coeff/packet access, and defeats the purpose of fast block + // evaluation API. + kStrided + }; + + template + Scalar* data() const { + eigen_assert(m_data_type_size == sizeof(Scalar)); + return static_cast(m_data); + } + + const Dimensions& strides() const { return m_strides; } + const DestinationBufferKind& kind() const { return m_kind; } + + private: + friend class TensorBlockDescriptor; + + DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {} + + template + DestinationBuffer(Scalar* data, const Dimensions& strides, DestinationBufferKind kind) + : m_data(static_cast(data)), m_data_type_size(sizeof(Scalar)), m_strides(strides), m_kind(kind) {} + + template + static DestinationBuffer make(const TensorBlockDescriptor& desc, Scalar* data, const Dimensions& strides) { + return DestinationBuffer(data, strides, kind(desc, strides)); + } + + template + static DestinationBufferKind kind(const TensorBlockDescriptor& desc, const Dimensions& strides) { + const Dimensions& desc_dims = desc.dimensions(); + const Dimensions& desc_strides = internal::strides(desc_dims); + for (int i = 0; i < NumDims; ++i) { + if (desc_dims[i] == 1) continue; + if (desc_strides[i] != strides[i]) return kStrided; + } + return kContiguous; + } + + // Storage pointer is type erased, to reduce template bloat, but we still + // keep the size of the underlying element type for error checking. + void* m_data; + size_t m_data_type_size; + + // Destination buffer dimensions always match the dimensions of a tensor + // block descriptor it belongs to, however strides might be different. + Dimensions m_strides; + + DestinationBufferKind m_kind; + }; + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, const DestinationBuffer& destination) + : m_offset(offset), m_dimensions(dimensions), m_destination(destination) {} + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions) + : m_offset(offset), m_dimensions(dimensions), m_destination(DestinationBuffer()) {} + + IndexType offset() const { return m_offset; } + const Dimensions& dimensions() const { return m_dimensions; } + IndexType dimension(int index) const { return m_dimensions[index]; } + IndexType size() const { return array_prod(m_dimensions); } + + const DestinationBuffer& destination() const { return m_destination; } + + template + void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) { + eigen_assert(dst_base != NULL); + m_destination = DestinationBuffer::template make(*this, dst_base, dst_strides); + } + + template + void AddDestinationBuffer(Scalar* dst_base, const DSizes& dst_strides) { + // DSizes constructor will do index type promotion if it's safe. 
+ AddDestinationBuffer(dst_base, Dimensions(dst_strides)); + } + + TensorBlockDescriptor& DropDestinationBuffer() { + m_destination.m_data = NULL; + m_destination.m_kind = DestinationBuffer::kEmpty; + return *this; + } + + bool HasDestinationBuffer() const { return m_destination.kind() != DestinationBuffer::kEmpty; } + + // Returns a copy of `*this` with updated offset. + TensorBlockDescriptor WithOffset(IndexType offset) const { + return TensorBlockDescriptor(offset, m_dimensions, m_destination); + } + + private: + // Offset and dimensions are immutable after construction. Block descriptor + // can only be mutated by adding or dropping destination. + const IndexType m_offset; + const Dimensions m_dimensions; + DestinationBuffer m_destination; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockMapper is responsible for iterating over the blocks of a tensor. + +template +class TensorBlockMapper { + typedef TensorBlockDescriptor BlockDescriptor; + + public: + typedef DSizes Dimensions; + + TensorBlockMapper() = default; + TensorBlockMapper(const DSizes& dimensions, const TensorBlockResourceRequirements& requirements) + : m_tensor_dimensions(dimensions), m_requirements(requirements) { + // Compute block dimensions and the total number of blocks. + InitializeBlockDimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { return m_total_block_count; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { return m_block_dimensions.TotalSize(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& blockDimensions() const { + return m_block_dimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor blockDescriptor(IndexType block_index) const { + static const bool isColMajor = Layout == static_cast(ColMajor); + + IndexType offset = 0; + DSizes dimensions; + + if (NumDims == 0) return BlockDescriptor(offset, dimensions); + + // Iterate outer -> inner dimensions. + for (int i = NumDims - 1; i >= 0; --i) { + const int dim = isColMajor ? i : NumDims - i - 1; + + const IndexType idx = block_index / m_block_strides[dim]; + block_index -= idx * m_block_strides[dim]; + + const IndexType coord = idx * m_block_dimensions[dim]; + dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, m_block_dimensions[dim]); + offset += coord * m_tensor_strides[dim]; + } + + return {offset, dimensions}; + } + + private: + void InitializeBlockDimensions() { + // Requested block shape and size. + const TensorBlockShapeType shape_type = m_requirements.shape_type; + IndexType target_block_size = numext::maxi(1, static_cast(m_requirements.size)); + + IndexType tensor_size = m_tensor_dimensions.TotalSize(); + + // Corner case: one of the dimensions is zero. Logic below is too complex + // to handle this case on a general basis, just use unit block size. + // Note: we must not yield blocks with zero dimensions (recipe for + // overflows/underflows, divisions by zero and NaNs later). + if (tensor_size == 0) { + for (int i = 0; i < NumDims; ++i) { + m_block_dimensions[i] = 1; + } + m_total_block_count = 0; + return; + } + + // If tensor fits into a target block size, evaluate it as a single block. + if (tensor_size <= target_block_size) { + m_block_dimensions = m_tensor_dimensions; + m_total_block_count = 1; + // The only valid block index is `0`, and in this case we do not need + // to compute real strides for tensor or blocks (see blockDescriptor). 
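+ // Editor's note: e.g. a 10x10 tensor with a target block size of 100
+ // coefficients falls into this single-block case, so the strides set below
+ // are never used for real offset arithmetic.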
+ for (int i = 0; i < NumDims; ++i) { + m_tensor_strides[i] = 0; + m_block_strides[i] = 1; + } + return; + } + + static const bool isColMajor = Layout == static_cast(ColMajor); + + // Block shape skewed towards inner dimension. + if (shape_type == TensorBlockShapeType::kSkewedInnerDims) { + IndexType coeff_to_allocate = target_block_size; + + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - i - 1; + m_block_dimensions[dim] = numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]); + coeff_to_allocate = + numext::div_ceil(coeff_to_allocate, numext::maxi(static_cast(1), m_block_dimensions[dim])); + } + eigen_assert(coeff_to_allocate == 1); + + } else if (shape_type == TensorBlockShapeType::kUniformAllDims) { + // Tensor will not fit within 'target_block_size' budget: calculate tensor + // block dimension sizes based on "square" dimension size target. + const IndexType dim_size_target = convert_index( + std::pow(static_cast(target_block_size), 1.0f / static_cast(m_block_dimensions.rank()))); + + for (int i = 0; i < NumDims; ++i) { + // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it + // a multiple of the packet size. Note that reducing + // 'block_dim_size' in this manner can increase the number of + // blocks, and so will amplify any per-block overhead. + m_block_dimensions[i] = numext::mini(dim_size_target, m_tensor_dimensions[i]); + } + + // Add any un-allocated coefficients to inner dimension(s). + IndexType total_size = m_block_dimensions.TotalSize(); + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - i - 1; + + if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) { + const IndexType total_size_other_dims = total_size / m_block_dimensions[dim]; + const IndexType alloc_avail = numext::div_ceil(target_block_size, total_size_other_dims); + if (alloc_avail == m_block_dimensions[dim]) { + // Insufficient excess coefficients to allocate. + break; + } + m_block_dimensions[dim] = numext::mini(m_tensor_dimensions[dim], alloc_avail); + total_size = total_size_other_dims * m_block_dimensions[dim]; + } + } + + } else { + eigen_assert(false); // unknown block shape + } + + eigen_assert(m_block_dimensions.TotalSize() >= + numext::mini(target_block_size, m_tensor_dimensions.TotalSize())); + + // Calculate block counts by dimension and total block count. + DSizes block_count; + for (int i = 0; i < NumDims; ++i) { + block_count[i] = numext::div_ceil(m_tensor_dimensions[i], m_block_dimensions[i]); + } + m_total_block_count = array_prod(block_count); + + // Calculate block strides (used for enumerating blocks). + m_tensor_strides = strides(m_tensor_dimensions); + m_block_strides = strides(block_count); + } + + DSizes m_tensor_dimensions; + TensorBlockResourceRequirements m_requirements; + + DSizes m_block_dimensions; + IndexType m_total_block_count; + + DSizes m_tensor_strides; + DSizes m_block_strides; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockScratchAllocator is responsible for allocating temporary buffers +// for block evaluation (output or input block materialization). Given that +// Eigen expression traversal order is deterministic, all temporary allocations +// are happening in the same order, and usually have exactly the same size. +// Scratch allocator keeps a trace of all dynamic allocations, and after the +// first block evaluation is completed, we should be able to reuse all the +// temporary buffers for the next block evaluation. 
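+//
+// A minimal usage sketch (illustrative only; `device`, `mapper` and
+// `block_bytes` are hypothetical names, not part of this header):
+//
+//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
+//   for (IndexType block = 0; block < mapper.blockCount(); ++block) {
+//     void* buf = scratch.allocate(block_bytes);
+//     // ... materialize block into `buf` ...
+//     scratch.reset();  // rewind so the next block reuses this allocation
+//   }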
+
+template <typename Device>
+class TensorBlockScratchAllocator {
+ public:
+  explicit TensorBlockScratchAllocator(const Device& device) : m_device(device), m_allocation_index(0) {}
+
+  ~TensorBlockScratchAllocator() {
+    for (size_t i = 0; i < m_allocations.size(); ++i) {
+      m_device.deallocate(m_allocations[i].ptr);
+    }
+  }
+
+  void* allocate(size_t size) {
+    // TODO(ezhulenev): Remove when replaced with inlined vector.
+    if (m_allocations.capacity() == 0) m_allocations.reserve(8);
+
+    // Check if we already have an existing allocation at the current index.
+    const int num_allocations = static_cast<int>(m_allocations.size());
+    const bool has_allocation = m_allocation_index < num_allocations;
+
+    // Allocation index can't be larger than the number of allocations.
+    eigen_assert(m_allocation_index <= num_allocations);
+
+    // If we have an existing allocation, and its size is larger than or equal
+    // to the requested size, we do nothing.
+
+    // If the current allocation can't fit the requested size, we deallocate it
+    // and replace it with a larger allocation.
+    if (has_allocation && m_allocations[m_allocation_index].size < size) {
+      m_device.deallocate(m_allocations[m_allocation_index].ptr);
+      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
+      m_allocations[m_allocation_index].size = size;
+    }
+
+    // Make a new allocation if we don't have an existing one.
+    if (!has_allocation) {
+      Allocation allocation;
+      allocation.ptr = m_device.allocate(size);
+      allocation.size = size;
+      m_allocations.push_back(allocation);
+    }
+
+    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
+    eigen_assert(m_allocations[m_allocation_index].size >= size);
+
+    return m_allocations[m_allocation_index++].ptr;
+  }
+
+  void reset() { m_allocation_index = 0; }
+
+ private:
+  struct Allocation {
+    void* ptr;
+    size_t size;
+  };
+
+  const Device& m_device;
+  int m_allocation_index;
+  // TODO(ezhulenev): This should be an inlined vector.
+  std::vector<Allocation> m_allocations;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockKind represents all possible block kinds that can be produced by
+// the TensorEvaluator::evalBlock function.
+enum TensorBlockKind {
+  // Tensor block that is a lazy expression that must be assigned to a
+  // destination using TensorBlockAssign.
+  kExpr,
+
+  // Tensor block that is a view into a memory buffer owned by an underlying
+  // Tensor expression (e.g. it can be a view into a Tensor buffer).
+  kView,
+
+  // Tensor block that was materialized in a scratch memory buffer, allocated
+  // with TensorBlockScratchAllocator. This block must be copied to a
+  // destination, similar to a block of `kExpr` type.
+  kMaterializedInScratch,
+
+  // Tensor block that was materialized directly into the final output memory
+  // buffer. For example if the left side of an assignment is a Tensor, we can
+  // directly materialize the block in the destination memory.
+  //
+  // If strides in the output buffer do not match tensor block strides, the
+  // Tensor expression will be invalid, and should not be used by
+  // TensorBlockAssign or for constructing another block expression.
+  kMaterializedInOutput
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockNotImplemented should be used to define the TensorBlock typedef
+// in TensorEvaluators that do not support block evaluation.
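+//
+// For example, an evaluator that cannot produce blocks would typically declare:
+//
+//   typedef internal::TensorBlockNotImplemented TensorBlock;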
+ +class TensorBlockNotImplemented { + public: + typedef void XprType; +}; + +// -------------------------------------------------------------------------- // +// XprScalar extracts Scalar type from the Eigen expressions (if expression type +// is not void). It's required to be able to define lazy block expression for +// argument types, that do not support block evaluation. + +template +struct XprScalar { + typedef typename XprType::Scalar type; +}; +template <> +struct XprScalar { + typedef void type; +}; + +// -------------------------------------------------------------------------- // +// TensorMaterializedBlock is a fully evaluated block of the original tensor, +// and XprType is just a TensorMap over the data. This block type is typically +// used to materialize blocks of tensor expressions, that can't be efficiently +// represented as lazy Tensor expressions with fast coeff/packet operations, +// e.g. we materialize all broadcasts into evaluated blocks. +// +// TensorMaterializedBlock does not own its memory buffer, it's either a memory +// buffer that backs the original expression (e.g. block is just a view into a +// Tensor), or a memory buffer allocated with scratch allocator, and in this +// case the scratch allocator will deallocate it at the end of block based +// expression execution. +// +// If the block was evaluated directly into the output buffer, and strides in +// the output buffer do not match block strides, the TensorMap expression will +// be invalid, and should never be used in block assignment or any other tensor +// expression. + +template +class TensorMaterializedBlock { + public: + typedef DSizes Dimensions; + typedef TensorMap > XprType; + + TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, const Dimensions& dimensions, + bool valid_expr = true) + : m_kind(kind), m_data(data), m_dimensions(dimensions), m_expr(m_data, m_dimensions), m_valid_expr(valid_expr) { + eigen_assert(m_kind == internal::TensorBlockKind::kView || + m_kind == internal::TensorBlockKind::kMaterializedInScratch || + m_kind == internal::TensorBlockKind::kMaterializedInOutput); + } + + TensorBlockKind kind() const { return m_kind; } + // NOTE(ezhulenev): Returning XprType by value like in other block types + // causes asan failures. The theory is that XprType::Nested doesn't work + // properly for TensorMap. + const XprType& expr() const { + eigen_assert(m_valid_expr); + return m_expr; + } + const Scalar* data() const { return m_data; } + void cleanup() {} + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + + // TensorMaterializedBlock can be backed by different types of storage: + // + // (1) Contiguous block of memory allocated with scratch allocator. + // (2) Contiguous block of memory reused from tensor block descriptor + // destination buffer. + // (3) Strided block of memory reused from tensor block descriptor + // destination buffer. + // + class Storage { + public: + Scalar* data() const { return m_data; } + const Dimensions& dimensions() const { return m_dimensions; } + const Dimensions& strides() const { return m_strides; } + + TensorMaterializedBlock AsTensorMaterializedBlock() const { + return TensorMaterializedBlock(m_materialized_in_output ? 
internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + m_data, m_dimensions, !m_strided_storage); + } + + private: + friend class TensorMaterializedBlock; + + Storage(Scalar* data, const Dimensions& dimensions, const Dimensions& strides, bool materialized_in_output, + bool strided_storage) + : m_data(data), + m_dimensions(dimensions), + m_strides(strides), + m_materialized_in_output(materialized_in_output), + m_strided_storage(strided_storage) {} + + Scalar* m_data; + Dimensions m_dimensions; + Dimensions m_strides; + bool m_materialized_in_output; + bool m_strided_storage; + }; + + // Creates a storage for materialized block either from the block descriptor + // destination buffer, or allocates a new buffer with scratch allocator. + template + EIGEN_STRONG_INLINE static Storage prepareStorage(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool allow_strided_storage = false) { + // Try to reuse destination as an output block buffer. + typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer; + + if (desc.destination().kind() == DestinationBuffer::kContiguous) { + Scalar* buffer = desc.destination().template data(); + desc.DropDestinationBuffer(); + return Storage(buffer, desc.dimensions(), internal::strides(desc.dimensions()), + /*materialized_in_output=*/true, + /*strided_storage=*/false); + + } else if (desc.destination().kind() == DestinationBuffer::kStrided && allow_strided_storage) { + Scalar* buffer = desc.destination().template data(); + desc.DropDestinationBuffer(); + return Storage(buffer, desc.dimensions(), desc.destination().strides(), + /*materialized_in_output=*/true, /*strided_storage=*/true); + + } else { + void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); + return Storage(static_cast(mem), desc.dimensions(), internal::strides(desc.dimensions()), + /*materialized_in_output=*/false, + /*strided_storage=*/false); + } + } + + // Creates a materialized block for the given descriptor from a memory buffer. + template + EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(const Scalar* data, const DataDimensions& data_dims, + TensorBlockDesc& desc, TensorBlockScratch& scratch) { + eigen_assert(array_size::value == desc.dimensions().size()); + + // If a tensor block dimensions covers a contiguous block of the underlying + // memory, we can skip block buffer memory allocation, and construct a block + // from existing `data` memory buffer. + // + // Example: (RowMajor layout) + // data_dims: [11, 12, 13, 14] + // desc.dimensions(): [1, 1, 3, 14] + // + // In this case we can construct a TensorBlock starting at + // `data + desc.offset()`, with a `desc.dimensions()` block sizes. + static const bool is_col_major = Layout == ColMajor; + + // Find out how many inner dimensions have a matching size. + int num_matching_inner_dims = 0; + for (int i = 0; i < NumDims; ++i) { + int dim = is_col_major ? i : NumDims - i - 1; + if (data_dims[dim] != desc.dimensions()[dim]) break; + ++num_matching_inner_dims; + } + + // All the outer dimensions must be of size `1`, except a single dimension + // before the matching inner dimension (`3` in the example above). + bool can_use_direct_access = true; + for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) { + int dim = is_col_major ? 
i : NumDims - i - 1;
+      if (desc.dimension(dim) != 1) {
+        can_use_direct_access = false;
+        break;
+      }
+    }
+
+    if (can_use_direct_access) {
+      const Scalar* block_start = data + desc.offset();
+      return TensorMaterializedBlock(internal::TensorBlockKind::kView, block_start, desc.dimensions());
+
+    } else {
+      // Reuse destination buffer or allocate new buffer with scratch allocator.
+      const Storage storage = prepareStorage(desc, scratch);
+
+      typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout> TensorBlockIO;
+      typedef typename TensorBlockIO::Dst TensorBlockIODst;
+      typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)), data, desc.offset());
+      TensorBlockIODst dst(storage.dimensions(), storage.strides(), storage.data());
+
+      TensorBlockIO::Copy(dst, src);
+      return storage.AsTensorMaterializedBlock();
+    }
+  }
+
+ private:
+  TensorBlockKind m_kind;
+  const Scalar* m_data;
+  Dimensions m_dimensions;
+  XprType m_expr;
+  bool m_valid_expr;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
+// functor to the blocks produced by the underlying Tensor expression.
+
+template <typename UnaryOp, typename ArgTensorBlock>
+class TensorCwiseUnaryBlock {
+  static constexpr bool NoArgBlockAccess = internal::is_void<typename ArgTensorBlock::XprType>::value;
+
+ public:
+  typedef std::conditional_t<NoArgBlockAccess, void,
+                             TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >
+      XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
+      : m_arg_block(arg_block), m_functor(functor) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
+  const Scalar* data() const { return NULL; }
+  void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+  ArgTensorBlock m_arg_block;
+  UnaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseBinaryBlock is a lazy tensor expression block that applies BinaryOp
+// functor to the blocks produced by the two underlying Tensor expressions.
+
+template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
+class TensorCwiseBinaryBlock {
+  static constexpr bool NoArgBlockAccess = internal::is_void<typename LhsTensorBlock::XprType>::value ||
+                                           internal::is_void<typename RhsTensorBlock::XprType>::value;
+
+ public:
+  typedef std::conditional_t<
+      NoArgBlockAccess, void,
+      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType, const typename RhsTensorBlock::XprType> >
+      XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, const RhsTensorBlock& right_block, const BinaryOp& functor)
+      : m_left_block(left_block), m_right_block(right_block), m_functor(functor) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+  XprType expr() const { return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); }
+
+  const Scalar* data() const { return NULL; }
+
+  void cleanup() {
+    m_left_block.cleanup();
+    m_right_block.cleanup();
+  }
+
+ private:
+  LhsTensorBlock m_left_block;
+  RhsTensorBlock m_right_block;
+  BinaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorUnaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from a block of the underlying type (this is a
+// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
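+//
+// A BlockFactory is expected to provide an `expr()` method that wraps the
+// argument block expression (see `TensorUnaryExprBlock::expr()` below). A
+// hypothetical sketch, where `NewDimensions` is an assumed dimensions type:
+//
+//   struct ReshapeBlockFactory {
+//     template <typename ArgXprType>
+//     TensorReshapingOp<const NewDimensions, const ArgXprType> expr(const ArgXprType& arg) const {
+//       return TensorReshapingOp<const NewDimensions, const ArgXprType>(arg, m_dims);
+//     }
+//     NewDimensions m_dims;
+//   };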
+ +template +class TensorUnaryExprBlock { + typedef typename ArgTensorBlock::XprType ArgXprType; + static constexpr bool NoArgBlockAccess = internal::is_void::value; + + public: + typedef std::conditional_t::type> XprType; + + typedef typename XprScalar::type Scalar; + + TensorUnaryExprBlock(const ArgTensorBlock& arg_block, const BlockFactory& factory) + : m_arg_block(arg_block), m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { return m_factory.expr(m_arg_block.expr()); } + const Scalar* data() const { return NULL; } + void cleanup() { m_arg_block.cleanup(); } + + private: + ArgTensorBlock m_arg_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// TensorTernaryExprBlock is a lazy tensor expression block that can construct +// an arbitrary tensor expression from three blocks of the underlying type. + +template +class TensorTernaryExprBlock { + typedef typename Arg1TensorBlock::XprType Arg1XprType; + typedef typename Arg2TensorBlock::XprType Arg2XprType; + typedef typename Arg3TensorBlock::XprType Arg3XprType; + + static constexpr bool NoArgBlockAccess = internal::is_void::value || + internal::is_void::value || + internal::is_void::value; + + public: + typedef std::conditional_t::type> + XprType; + + typedef typename XprScalar::type Scalar; + + TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, const Arg2TensorBlock& arg2_block, + const Arg3TensorBlock& arg3_block, const BlockFactory& factory) + : m_arg1_block(arg1_block), m_arg2_block(arg2_block), m_arg3_block(arg3_block), m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), m_arg3_block.expr()); } + const Scalar* data() const { return NULL; } + void cleanup() { + m_arg1_block.cleanup(); + m_arg2_block.cleanup(); + m_arg3_block.cleanup(); + } + + private: + Arg1TensorBlock m_arg1_block; + Arg2TensorBlock m_arg2_block; + Arg3TensorBlock m_arg3_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// StridedLinearBufferCopy provides a method to copy data between two linear +// buffers with different strides, with optimized paths for scatter/gather. + +template +class StridedLinearBufferCopy { + typedef typename packet_traits::type Packet; + typedef typename unpacket_traits::half HalfPacket; + enum { + Vectorizable = packet_traits::Vectorizable, + PacketSize = packet_traits::size, + HalfPacketSize = unpacket_traits::size, + HasHalfPacket = static_cast(HalfPacketSize) < static_cast(PacketSize) + }; + + public: + // Specifying linear copy kind statically gives ~30% speedup for small sizes. 
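+  // The kind is passed as a template argument of `Run` at the call site, e.g.
+  // (sketch, with `LinCopy` an alias for this class as used by TensorBlockIO):
+  //
+  //   LinCopy::template Run<LinCopy::Kind::Linear>(dst, src, count);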
+ enum class Kind { + Linear = 0, // src_stride == 1 && dst_stride == 1 + Scatter = 1, // src_stride == 1 && dst_stride != 1 + FillLinear = 2, // src_stride == 0 && dst_stride == 1 + FillScatter = 3, // src_stride == 0 && dst_stride != 1 + Gather = 4, // dst_stride == 1 + Random = 5 // everything else + }; + + struct Dst { + Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + Scalar* data; + }; + + struct Src { + Src(IndexType o, IndexType s, const Scalar* d) : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + const Scalar* data; + }; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, const Src& src, const size_t count) { + Run(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, src.data); + } + + private: + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const IndexType count, const IndexType dst_offset, + const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data, + const IndexType src_offset, const IndexType src_stride, + const Scalar* EIGEN_RESTRICT src_data) { + const Scalar* src = &src_data[src_offset]; + Scalar* dst = &dst_data[dst_offset]; + + if (!Vectorizable) { + for (Index i = 0; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + return; + } + + const IndexType vectorized_size = PacketSize * (count / PacketSize); + IndexType i = 0; + + if (kind == StridedLinearBufferCopy::Kind::Linear) { + // ******************************************************************** // + // Linear copy from `src` to `dst`. + const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize)); + eigen_assert(src_stride == 1 && dst_stride == 1); + for (; i < unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + Packet p = ploadu(src + i + j * PacketSize); + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i < vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pstoreu(dst + i, p); + } + if (HasHalfPacket) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { + HalfPacket p = ploadu(src + i); + pstoreu(dst + i, p); + i += HalfPacketSize; + } + } + for (; i < count; ++i) { + dst[i] = src[i]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Scatter) { + // Scatter from `src` to `dst`. + eigen_assert(src_stride == 1 && dst_stride != 1); + for (; i < vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pscatter(dst + i * dst_stride, p, dst_stride); + } + if (HasHalfPacket) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { + HalfPacket p = ploadu(src + i); + pscatter(dst + i * dst_stride, p, dst_stride); + i += HalfPacketSize; + } + } + for (; i < count; ++i) { + dst[i * dst_stride] = src[i]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) { + // Fill `dst` with value at `*src`. 
+ eigen_assert(src_stride == 0 && dst_stride == 1); + + const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize)); + Scalar s = *src; + Packet p = pset1(s); + for (; i < unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i < vectorized_size; i += PacketSize) { + pstoreu(dst + i, p); + } + if (HasHalfPacket) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { + HalfPacket hp = pset1(s); + pstoreu(dst + i, hp); + i += HalfPacketSize; + } + } + for (; i < count; ++i) { + dst[i] = s; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) { + // Scatter `*src` into `dst`. + eigen_assert(src_stride == 0 && dst_stride != 1); + Scalar s = *src; + Packet p = pset1(s); + for (; i < vectorized_size; i += PacketSize) { + pscatter(dst + i * dst_stride, p, dst_stride); + } + if (HasHalfPacket) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { + HalfPacket hp = pset1(s); + pscatter(dst + i * dst_stride, hp, dst_stride); + i += HalfPacketSize; + } + } + for (; i < count; ++i) { + dst[i * dst_stride] = s; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Gather) { + // Gather from `src` into `dst`. + eigen_assert(dst_stride == 1); + for (; i < vectorized_size; i += PacketSize) { + Packet p = pgather(src + i * src_stride, src_stride); + pstoreu(dst + i, p); + } + if (HasHalfPacket) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { + HalfPacket p = pgather(src + i * src_stride, src_stride); + pstoreu(dst + i, p); + i += HalfPacketSize; + } + } + for (; i < count; ++i) { + dst[i] = src[i * src_stride]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Random) { + // Random. + for (; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + } else { + eigen_assert(false); + } + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block. +// It's possible to specify src->dst dimension mapping for the copy operation. +// Dimensions of `dst` specify how many elements have to be copied, for the +// `src` we need to know only stride to navigate through source memory buffer. 
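+//
+// For example (sketch): for a rank-2 copy whose dimensions are swapped between
+// `src` and `dst`, the map is {1, 0}:
+//
+//   typename TensorBlockIO<Scalar, IndexType, 2, Layout>::DimensionsMap map;
+//   map[0] = 1;  map[1] = 0;  // src_dim = map[dst_dim]
+//   TensorBlockIO<Scalar, IndexType, 2, Layout>::Copy(dst, src, map);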
+ +template +class TensorBlockIO { + static constexpr bool IsColMajor = (Layout == ColMajor); + + typedef StridedLinearBufferCopy LinCopy; + + public: + typedef DSizes Dimensions; + typedef DSizes DimensionsMap; + + struct Dst { + Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, IndexType dst_offset = 0) + : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} + + Dimensions dims; + Dimensions strides; + Scalar* data; + IndexType offset; + }; + + struct Src { + Src(const Dimensions& src_strides, const Scalar* src, IndexType src_offset = 0) + : strides(src_strides), data(src), offset(src_offset) {} + + Dimensions strides; + const Scalar* data; + IndexType offset; + }; + + // Copies data to `dst` from `src`, using provided dimensions mapping: + // + // src_dimension_index = dst_to_src_dim_map[dst_dimension_index] + // + // Returns the number of copied elements. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(const Dst& dst, const Src& src, + const DimensionsMap& dst_to_src_dim_map) { + // Copy single scalar value from `src` to `dst`. + if (NumDims == 0) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return 1; + } + + // Both `dst` and `src` must have contiguous innermost dimension. We also + // accept the special case with stride '0', because it's used as a trick to + // implement broadcasting. + { + int inner_dim = IsColMajor ? 0 : NumDims - 1; + EIGEN_UNUSED_VARIABLE(inner_dim); + eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0); + eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0); + } + + // Give a shorter name to `dst_to_src_dim_map`. + const DimensionsMap& dim_map = dst_to_src_dim_map; + + // Do not squeeze reordered inner dimensions. + int num_squeezable_dims = NumSqueezableInnerDims(dim_map); + + // NOTE: We find the innermost dimension (contiguous in memory) in the dst + // block, and we write data linearly into that dimension, reading it from + // the src. If dimensions are reordered, we might end up reading data from + // the src with `stride != 1`. + // + // NOTE: Random-Read/Linear-Write can be up to ~2X faster than + // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680 + + // Find the innermost dimension in the dst whose size is not 1. This is the + // effective inner dim. + int num_size_one_inner_dims = 0; + for (int i = 0; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? i : NumDims - i - 1; + if (dst.dims[dst_dim] != 1) break; + num_size_one_inner_dims++; + } + + // If all dimensions are of size 1, just copy a scalar from `src` to `dst`. + if (num_size_one_inner_dims == NumDims) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return 1; + } + + // Outermost dimension in the dst with `stride == 1` (contiguous in memory). + const int dst_stride1_dim = IsColMajor ? num_size_one_inner_dims : NumDims - num_size_one_inner_dims - 1; + + // Dimension in the src that corresponds to the dst innermost dimension. + const int src_dim_for_dst_stride1_dim = NumDims == 0 ? 1 : dim_map[dst_stride1_dim]; + + // Size of the innermost dimension (length of contiguous blocks of memory). + IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim]; + + // Squeeze multiple inner dims into one if they are contiguous in `dst` and + // `src` memory, so we can do less linear copy calls. + for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? 
i : NumDims - i - 1; + const IndexType dst_stride = dst.strides[dst_dim]; + const IndexType src_stride = src.strides[dim_map[dst_dim]]; + if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) { + dst_inner_dim_size *= dst.dims[dst_dim]; + ++num_size_one_inner_dims; + } else { + break; + } + } + + // Setup strides to read data from `src` and write to `dst`. + IndexType input_offset = src.offset; + IndexType output_offset = dst.offset; + IndexType input_stride = NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim]; + IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim]; + + const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array it; + + // Initialize block iterator state. Squeeze away any dimension of size 1. + int idx = 0; // currently initialized iterator state index + for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { + const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2; + if (dst.dims[dst_dim] == 1) continue; + + it[idx].size = dst.dims[dst_dim]; + it[idx].input_stride = src.strides[dim_map[dst_dim]]; + it[idx].output_stride = dst.strides[dst_dim]; + + it[idx].input_span = it[idx].input_stride * (it[idx].size - 1); + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + + idx++; + } + + // Iterate copying data from src to dst. + const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize(); + +#define COPY_INNER_DIM(KIND) \ + IndexType num_copied = 0; \ + for (num_copied = 0; num_copied < block_total_size; num_copied += dst_inner_dim_size) { \ + LinCopy::template Run(typename LinCopy::Dst(output_offset, output_stride, dst.data), \ + typename LinCopy::Src(input_offset, input_stride, src.data), dst_inner_dim_size); \ + \ + for (int j = 0; j < idx; ++j) { \ + if (++it[j].count < it[j].size) { \ + input_offset += it[j].input_stride; \ + output_offset += it[j].output_stride; \ + break; \ + } \ + it[j].count = 0; \ + input_offset -= it[j].input_span; \ + output_offset -= it[j].output_span; \ + } \ + } \ + return num_copied; + + if (input_stride == 1 && output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::Linear); + } else if (input_stride == 1 && output_stride != 1) { + COPY_INNER_DIM(LinCopy::Kind::Scatter); + } else if (input_stride == 0 && output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::FillLinear); + } else if (input_stride == 0 && output_stride != 1) { + COPY_INNER_DIM(LinCopy::Kind::FillScatter); + } else if (output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::Gather); + } else { + COPY_INNER_DIM(LinCopy::Kind::Random); + } + +#undef COPY_INNER_DIM + } + + // Copy from `src` to `dst` with an identity src->dst dimension map. Returns + // the number of copied elements. + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, const Src& src) { + DimensionsMap dst_to_src_map; + for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i; + return Copy(dst, src, dst_to_src_map); + } + + private: + struct BlockIteratorState { + BlockIteratorState() : size(0), count(0), input_stride(0), output_stride(0), input_span(0), output_span(0) {} + + IndexType size; + IndexType count; + IndexType input_stride; + IndexType output_stride; + IndexType input_span; + IndexType output_span; + }; + + // Compute how many inner dimensions it's allowed to squeeze when doing IO + // between two tensor blocks. It's safe to squeeze inner dimensions, only + // if they are not reordered. 
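+  //
+  // E.g. (ColMajor) for dim_map == {0, 1, 3, 2} the two innermost dimensions
+  // map to themselves, so two dimensions are squeezable; dimension 2 is
+  // reordered and stops the scan.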
+ static int NumSqueezableInnerDims(const DimensionsMap& dim_map) { + int num_squeezable_dims = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + if (dim_map[dim] != dim) break; + num_squeezable_dims++; + } + return num_squeezable_dims; + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to +// a Tensor block defined by `desc`, backed by a memory buffer at `target`. +// +// Currently there is no way to write from a Tensor expression to a block of +// memory, if dimensions are reordered. If you need to do that, you should +// materialize a Tensor block expression into a memory buffer, and then use +// TensorBlockIO to copy data between two memory buffers with a custom +// `target->src` dimension map (see definition above). +// +// Also currently the innermost dimension of `target` must have a stride '1' +// (contiguous in memory). This restriction could be lifted with a `pscatter`, +// but in practice it's never needed, and there is a similar TensorBlockIO +// workaround for that. +// +// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO +// where `src` is a tensor expression. Explore if it is possible to rewrite IO +// to use expressions instead of pointers, and after that TensorBlockAssignment +// will become an alias to IO. +template +class TensorBlockAssignment { + // We will use coeff/packet path to evaluate block expressions. + typedef TensorEvaluator TensorBlockEvaluator; + + typedef DSizes Dimensions; + + enum { Vectorizable = packet_traits::Vectorizable, PacketSize = packet_traits::size }; + + template + struct InnerDimAssign { + EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) { + for (IndexType i = 0; i < count; ++i) { + target[i] = eval.coeff(eval_offset + i); + } + } + }; + + template + struct InnerDimAssign { + EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) { + typedef typename packet_traits::type Packet; + + const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize)); + const IndexType vectorized_size = PacketSize * (count / PacketSize); + IndexType i = 0; + + for (; i < unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + const IndexType idx = eval_offset + i + j * PacketSize; + Packet p = eval.template packet(idx); + pstoreu(target + i + j * PacketSize, p); + } + } + + for (; i < vectorized_size; i += PacketSize) { + Packet p = eval.template packet(eval_offset + i); + pstoreu(target + i, p); + } + + for (; i < count; ++i) { + target[i] = eval.coeff(eval_offset + i); + } + } + }; + + public: + struct Target { + Target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data, + IndexType target_offset = 0) + : dims(target_dims), strides(target_strides), data(target_data), offset(target_offset) {} + + Dimensions dims; + Dimensions strides; + Scalar* data; + IndexType offset; + }; + + static Target target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data, + IndexType target_offset = 0) { + return Target(target_dims, target_strides, target_data, target_offset); + } + + template + static Target target(const DSizes& target_dims, + const DSizes& target_strides, Scalar* target_data, + IndexType target_offset = 0) { + // DSizes constructor will do index type promotion 
if it's safe. + return Target(Dimensions(target_dims), Dimensions(target_strides), target_data, target_offset); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Target& target, const TensorBlockExpr& expr) { + // Prepare evaluator for block expression. + DefaultDevice default_device; + TensorBlockEvaluator eval(expr, default_device); + + // Tensor block expression dimension should match destination dimensions. + eigen_assert(dimensions_match(target.dims, eval.dimensions())); + + static const int Layout = TensorBlockEvaluator::Layout; + static const bool is_col_major = Layout == ColMajor; + + // Initialize output inner dimension size based on a layout. + const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize(); + const int inner_dim_idx = is_col_major ? 0 : NumDims - 1; + IndexType output_inner_dim_size = target.dims[inner_dim_idx]; + + // Target inner dimension stride must be '1'. + eigen_assert(target.strides[inner_dim_idx] == 1); + + // Squeeze multiple inner dims into one if they are contiguous in `target`. + IndexType num_squeezed_dims = 0; + for (Index i = 1; i < NumDims; ++i) { + const Index dim = is_col_major ? i : NumDims - i - 1; + const IndexType target_stride = target.strides[dim]; + + if (output_inner_dim_size == target_stride) { + output_inner_dim_size *= target.dims[dim]; + num_squeezed_dims++; + } else { + break; + } + } + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + + int idx = 0; // currently initialized iterator state index + for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) { + const Index dim = is_col_major ? i + 1 : NumDims - i - 2; + + it[idx].count = 0; + it[idx].size = target.dims[dim]; + it[idx].output_stride = target.strides[dim]; + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + idx++; + } + + // We read block expression from the beginning, and start writing data to + // `target` at given offset. + IndexType input_offset = 0; + IndexType output_offset = target.offset; + + // Iterate copying data from `eval` to `target`. + for (IndexType i = 0; i < output_size; i += output_inner_dim_size) { + // Assign to `target` at current offset. + InnerDimAssign::Run( + target.data + output_offset, output_inner_dim_size, eval, input_offset); + + // Move input offset forward by the number of assigned coefficients. + input_offset += output_inner_dim_size; + + // Update index. 
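+      // (Odometer-style carry: advance the innermost dimension's counter;
+      // when it wraps, rewind its output span and carry into the next-outer
+      // dimension.)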
+ for (int j = 0; j < idx; ++j) { + if (++it[j].count < it[j].size) { + output_offset += it[j].output_stride; + break; + } + it[j].count = 0; + output_offset -= it[j].output_span; + } + } + } + + private: + struct BlockIteratorState { + BlockIteratorState() : count(0), size(0), output_stride(0), output_span(0) {} + + IndexType count; + IndexType size; + IndexType output_stride; + IndexType output_span; + }; +}; + +// -------------------------------------------------------------------------- // + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h new file mode 100644 index 00000000000..9ef4bbce092 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -0,0 +1,1001 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H +#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorBroadcasting + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor broadcasting class. + * + * + */ +namespace internal { +template +struct traits> : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorBroadcastingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, + typename eval>::type> { + typedef TensorBroadcastingOp type; +}; + +template +struct is_input_scalar { + static const bool value = false; +}; +template <> +struct is_input_scalar> { + static const bool value = true; +}; +template +struct is_input_scalar> { + static constexpr bool value = (Sizes::total_size == 1); +}; + +} // end namespace internal + +template +class TensorBroadcastingOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) + : m_xpr(expr), m_broadcast(broadcast) {} + + EIGEN_DEVICE_FUNC const Broadcast& broadcast() const { return m_broadcast; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Broadcast m_broadcast; +}; + +// Eval as rvalue +template +struct 
TensorEvaluator, Device> { + typedef TensorBroadcastingOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions InputDimensions; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator won't be + // standard layout; + bool isCopy, nByOne, oneByN; + + public: + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + RawAccess = false + }; + static constexpr int Layout = TensorEvaluator::Layout; + + typedef std::remove_const_t ScalarNoConst; + + // We do block based broadcasting using a trick with 2x tensor rank and 0 + // strides. See block method implementation for details. + typedef DSizes BroadcastDimensions; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : isCopy(false), + nByOne(false), + oneByN(false), + m_device(device), + m_broadcast(op.broadcast()), + m_impl(op.expression(), device) { + // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar + // and store the result in a scalar. Instead one should reshape the scalar into a N-D + // tensor with N >= 1 of 1 element first and then broadcast. 
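+    // For example (sketch, with `n` an assumed broadcast factor):
+    //   Eigen::Tensor<float, 0> s;
+    //   auto b = s.reshape(Eigen::array<Eigen::Index, 1>{{1}})
+    //             .broadcast(Eigen::array<Eigen::Index, 1>{{n}});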
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + const InputDimensions& input_dims = m_impl.dimensions(); + isCopy = true; + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i] * m_broadcast[i]; + if (m_broadcast[i] != 1) { + isCopy = false; + } + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_inputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + + if (input_dims[0] == 1) { + oneByN = true; + for (int i = 1; i < NumDims; ++i) { + if (m_broadcast[i] != 1) { + oneByN = false; + break; + } + } + } else if (input_dims[NumDims - 1] == 1) { + nByOne = true; + for (int i = 0; i < NumDims - 1; ++i) { + if (m_broadcast[i] != 1) { + nByOne = false; + break; + } + } + } + + // Handle special format like NCHW, its input shape is '[1, N..., 1]' and + // broadcast shape is '[N, 1..., N]' + if (!oneByN && !nByOne) { + if (input_dims[0] == 1 && input_dims[NumDims - 1] == 1 && NumDims > 2) { + nByOne = true; + oneByN = true; + for (int i = 1; i < NumDims - 1; ++i) { + if (m_broadcast[i] != 1) { + nByOne = false; + oneByN = false; + break; + } + } + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { + if (internal::is_input_scalar>::value) { + return m_impl.coeff(0); + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + if (isCopy) { + return m_impl.coeff(index); + } else { + return coeffColMajor(index); + } + } else { + if (isCopy) { + return m_impl.coeff(index); + } else { + return coeffRowMajor(index); + } + } + } + + // TODO: attempt to speed this up. 
The integer divisions and modulo are slow + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const { + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + inputIndex += index; + } else { + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[0]); + } + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const { + return m_impl.coeff(indexColMajor(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const { + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims - 1]); + inputIndex += index; + } else { + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[NumDims - 1]); + } + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const { + return m_impl.coeff(indexRowMajor(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + if (internal::is_input_scalar>::value) { + return internal::pset1(m_impl.coeff(0)); + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + if (isCopy) { +#ifdef EIGEN_GPU_COMPILE_PHASE + // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing + // unaligned loads here. The reason is unclear though. + return m_impl.template packet(index); +#else + return m_impl.template packet(index); +#endif + } else if (oneByN && !nByOne) { + return packetNByOne(index); + } else if (!oneByN && nByOne) { + return packetOneByN(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne(index); + } else { + return packetColMajor(index); + } + } else { + if (isCopy) { +#ifdef EIGEN_GPU_COMPILE_PHASE + // See above. 
+ return m_impl.template packet(index); +#else + return m_impl.template packet(index); +#endif + } else if (oneByN && !nByOne) { + return packetOneByN(index); + } else if (!oneByN && nByOne) { + return packetNByOne(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne(index); + } else { + return packetRowMajor(index); + } + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + Index startDim, endDim; + Index inputIndex, outputOffset, batchedIndex; + + if (static_cast(Layout) == static_cast(ColMajor)) { + startDim = NumDims - 1; + endDim = 1; + } else { + startDim = 0; + endDim = NumDims - 2; + } + + batchedIndex = index % m_outputStrides[startDim]; + inputIndex = batchedIndex / m_outputStrides[endDim]; + outputOffset = batchedIndex % m_outputStrides[endDim]; + + if (outputOffset + PacketSize <= m_outputStrides[endDim]) { + values[0] = m_impl.coeff(inputIndex); + return internal::pload1(values); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { + if (outputOffset + cur < m_outputStrides[endDim]) { + values[i] = m_impl.coeff(inputIndex); + } else { + ++inputIndex; + inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex); + values[i] = m_impl.coeff(inputIndex); + outputOffset = 0; + cur = 0; + } + } + return internal::pload(values); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const { + // Consider the flattened tensor [v0, ..., vN], + // Concatenates m_broadcast[dim] copies, + // [v0, ..., vN, v0, ..., vN, ... ] + // with dim == NumDims - 1 for col-major, dim == 0 for row-major. + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + // Size of flattened tensor. + const Index M = + (static_cast(Layout) == static_cast(ColMajor)) ? m_inputStrides[NumDims - 1] : m_inputStrides[0]; + Index inputIndex = index % M; + if (inputIndex + PacketSize <= M) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + if (inputIndex > M - 1) { + inputIndex = 0; + } + values[i] = m_impl.coeff(inputIndex++); + } + return internal::pload(values); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const { + // Consider the flattened tensor [v0, ..., vN], + // Interleaves m_broadcast[dim] copies, + // [v0, v0, ..., v1, v1, ..., vN, vN, ... ] + // with dim == 0 for col-major, dim == NumDims - 1 for row-major. + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + const Index M = + (static_cast(Layout) == static_cast(ColMajor)) ? m_broadcast[0] : m_broadcast[NumDims - 1]; + + Index inputIndex = index / M; + Index outputOffset = index % M; + if (outputOffset + PacketSize <= M) { + return internal::pset1(m_impl.coeff(inputIndex)); + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + if (outputOffset < M) { + values[i] = m_impl.coeff(inputIndex); + ++outputOffset; + } else { + values[i] = m_impl.coeff(++inputIndex); + outputOffset = 1; // Next offset. + } + } + return internal::pload(values); + } + } + + // Ignore the LoadMode and always use unaligned loads since we can't guarantee + // the alignment at compile time. 
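+  // (This is why packetColMajor/packetRowMajor below read the input with
+  // m_impl.template packet<Unaligned>(...) regardless of the caller's
+  // LoadMode.)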
+ template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + innermostLoc = index; + } else { + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[0]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. + if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + values[0] = m_impl.coeff(inputIndex); + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize; ++i) { + if (innermostLoc + i < m_impl.dimensions()[0]) { + values[i] = m_impl.coeff(inputIndex + i); + } else { + values[i] = coeffColMajor(originalIndex + i); + } + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims - 1]); + innermostLoc = index; + } else { + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[NumDims - 1]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. 
+ if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims - 1]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + values[0] = m_impl.coeff(inputIndex); + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize; ++i) { + if (innermostLoc + i < m_impl.dimensions()[NumDims - 1]) { + values[i] = m_impl.coeff(inputIndex + i); + } else { + values[i] = coeffRowMajor(originalIndex + i); + } + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = TensorOpCost::AddCost(); + if (!isCopy && NumDims > 0) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + compute_cost += TensorOpCost::DivCost(); + if (internal::index_statically_eq(i, 1)) { + compute_cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } else { + if (!internal::index_statically_eq(i, 1)) { + compute_cost += + TensorOpCost::MulCost() + TensorOpCost::ModCost() + TensorOpCost::AddCost(); + } + } + compute_cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } + } + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large + // tensors. But this might need further tuning. + const size_t target_size = m_device.firstLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + m_impl.getResourceRequirements(), internal::TensorBlockResourceRequirements::skewed(target_size)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + BlockBroadcastingParams params = blockBroadcastingParams(desc); + + if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) { + return emptyBlock(); + } + + // Prepare storage for the materialized broadcasting result. + const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + ScalarNoConst* materialized_output = block_storage.data(); + + // We potentially will need to materialize input blocks. + size_t materialized_input_size = 0; + ScalarNoConst* materialized_input = NULL; + + // Initialize block broadcating iterator state for outer dimensions (outer + // with regard to bcast dimension). Dimension in this array are always in + // inner_most -> outer_most order (col major layout). + array it; + int idx = 0; + + for (int i = params.inner_dim_count + 1; i < NumDims; ++i) { + const Index dim = IsColMajor ? i : NumDims - 1 - i; + it[idx].size = params.output_dims[dim]; + it[idx].count = 0; + it[idx].output_stride = m_outputStrides[dim]; + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + idx++; + } + + // Write output into the beginning of `materialized_output`. + Index output_offset = 0; + + // We will fill output block by broadcasting along the bcast dim, and + // iterating over outer dimension. + const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize(); + + for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) { + ScalarNoConst* bcast_output = materialized_output + num_output_coeffs; + Index bcast_offset = desc.offset() + output_offset; + + // Broadcast along the bcast dimension. 
+      num_output_coeffs += BroadcastBlockAlongBcastDim(params, bcast_offset, scratch, bcast_output,
+                                                       &materialized_input, &materialized_input_size);
+
+      // Switch to the next outer dimension.
+      for (int j = 0; j < idx; ++j) {
+        if (++it[j].count < it[j].size) {
+          output_offset += it[j].output_stride;
+          break;
+        }
+        it[j].count = 0;
+        output_offset -= it[j].output_span;
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+  Broadcast functor() const { return m_broadcast; }
+
+ private:
+  static constexpr bool IsColMajor = static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+  // We will build a general case block broadcasting on top of a broadcasting
+  // primitive that will do broadcasting only for the inner dimension(s) along
+  // the first dimension smaller than the input size (it's called `bcast_dim`).
+  //
+  // Example:
+  //   dim:          0  1  2   (ColMajor)
+  //   input size:  [9, 3, 6]
+  //   block size:  [9, 2, 6]
+  //
+  // We will compute the broadcasted block by iterating over the outer
+  // dimensions after `bcast_dim` (only dimension `2` in this example) and
+  // computing broadcasts along the `bcast_dim` (dimension `1` in this example).
+
+  // BlockBroadcastingParams holds precomputed parameters for broadcasting a
+  // single block along the broadcasting dimension. Sizes and strides along the
+  // `bcast_dim` might be invalid, they will be adjusted later in
+  // `BroadcastBlockAlongBcastDim`.
+  struct BlockBroadcastingParams {
+    Dimensions input_dims;      // input expression dimensions
+    Dimensions output_dims;     // output block sizes
+    Dimensions output_strides;  // output block strides
+
+    int inner_dim_count;   // count inner dimensions matching in size
+    int bcast_dim;         // broadcasting dimension index
+    Index bcast_dim_size;  // broadcasting dimension size
+    Index inner_dim_size;  // inner dimensions size
+
+    // Block sizes and strides for the input block where all dimensions before
+    // `bcast_dim` are equal to `1`.
+    Dimensions input_block_sizes;
+    Dimensions input_block_strides;
+
+    // Block sizes and strides for blocks with extra dimensions and strides `0`.
+    BroadcastDimensions bcast_block_sizes;
+    BroadcastDimensions bcast_block_strides;
+    BroadcastDimensions bcast_input_strides;
+  };
+
+  struct BlockBroadcastingIteratorState {
+    Index size;
+    Index count;
+    Index output_stride;
+    Index output_span;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams blockBroadcastingParams(TensorBlockDesc& desc) const {
+    BlockBroadcastingParams params;
+
+    params.input_dims = Dimensions(m_impl.dimensions());
+
+    // Output block sizes and strides.
+    params.output_dims = desc.dimensions();
+    params.output_strides = internal::strides<Layout>(params.output_dims);
+
+    // Find the broadcasting dimension (first dimension with output size
+    // smaller than the input size).
+    params.bcast_dim = 0;
+    params.bcast_dim_size = 1;
+    params.inner_dim_size = 1;
+
+    // Count the number of inner dimensions that have the same size in the
+    // block and in the broadcast expression.
+    params.inner_dim_count = 0;
+
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+
+      if (params.output_dims[dim] == m_dimensions[dim]) {
+        params.inner_dim_size *= params.output_dims[dim];
+        ++params.inner_dim_count;
+        continue;
+      }
+
+      // First non-matching dimension is the broadcasting dimension.
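+      // [Illustration, not part of the upstream file] For the example above
+      // (input [9, 3, 6], block [9, 2, 6], ColMajor): dim 0 matches (9 == 9),
+      // so inner_dim_count == 1 and inner_dim_size == 9; dim 1 is the first
+      // mismatch (2 < 3), making bcast_dim == 1 and bcast_dim_size == 2.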
+ eigen_assert(params.output_dims[dim] < m_dimensions[dim]); + params.bcast_dim = dim; + params.bcast_dim_size = params.output_dims[dim]; + break; + } + + // Calculate the input block size for looking into the input. + for (int i = 0; i < params.inner_dim_count; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + params.input_block_sizes[dim] = params.input_dims[dim]; + } + for (int i = params.inner_dim_count; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + params.input_block_sizes[dim] = 1; + } + params.input_block_strides = internal::strides(params.input_block_sizes); + + // Broadcast with the 0-stride trick: Create 1 extra dim for each + // broadcast, set the input stride to 0. + // + // When ColMajor: + // + // - bcast_block_sizes: + // [d_0, b_0, d_1, b_1, ...] + // + // - bcast_block_strides: + // [output_block_strides[0], output_block_strides[0] * d_0, + // output_block_strides[1], output_block_strides[1] * d_1, + // ...] + // + // - bcast_input_strides: + // [input_block_strides[0], 0, + // input_block_strides[1], 0, + // ...]. + // + for (int i = 0; i < params.inner_dim_count; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + + const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1; + const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1; + + params.bcast_block_sizes[copy_dim] = params.input_dims[dim]; + params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim]; + params.bcast_block_strides[copy_dim] = params.output_strides[dim]; + params.bcast_block_strides[broadcast_dim] = params.output_strides[dim] * params.input_dims[dim]; + params.bcast_input_strides[copy_dim] = params.input_block_strides[dim]; + params.bcast_input_strides[broadcast_dim] = 0; + } + + for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) { + const int dim = IsColMajor ? i : 2 * NumDims - i - 1; + params.bcast_block_sizes[dim] = 1; + params.bcast_block_strides[dim] = 0; + params.bcast_input_strides[dim] = 0; + } + + return params; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const { + DSizes dimensions; + for (int i = 0; i < NumDims; ++i) dimensions[i] = 0; + return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim( + BlockBroadcastingParams params, Index bcast_offset, TensorBlockScratch& scratch, + ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, size_t* materialized_input_size) const { + if (params.bcast_dim_size == 1) { + // We just need one block read using the ready-set values above. + return BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + + } else if (params.input_dims[params.bcast_dim] == 1) { + // Broadcast bcast dimension (< NumDims) by bcast_dim_size. + const int broadcast_bcast_dim = + IsColMajor ? 
2 * params.inner_dim_count + 1 : 2 * NumDims - 2 * params.inner_dim_count - 2; + + params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = params.output_strides[params.bcast_dim]; + + return BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + + } else { + // Keep track of the total number of the coefficients written to the + // output block. + Index num_output_coeffs = 0; + + // The general case. Let's denote the output block as + // + // x[..., a:a+bcast_dim_size, :, ..., :] + // + // where a:a+bcast_dim_size is a slice on the bcast_dim dimension + // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3 + // sub-blocks: + // + // (1) a:b, where b is the smallest multiple of + // input_dims[bcast_dim_start] in [a, a+bcast_dim_size]. + // + // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start] + // in [a, a+bcast_dim_size]. + // + // (3) c:a+bcast_dim_size . + // + // Or, when b and c do not exist, we just need to process the whole block + // together. + + // Find a. + const Index bcast_dim_left_index = bcast_offset / m_outputStrides[params.bcast_dim]; + + // Find b and c. + const Index input_bcast_dim_size = params.input_dims[params.bcast_dim]; + + // First multiple after a. This is b when <= bcast_dim_left_index + + // bcast_dim_size. + const Index first_multiple = + numext::div_ceil(bcast_dim_left_index, input_bcast_dim_size) * input_bcast_dim_size; + + if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) { + // b exists, so does c. Find it. + const Index last_multiple = + (bcast_dim_left_index + params.bcast_dim_size) / input_bcast_dim_size * input_bcast_dim_size; + const int copy_bcast_dim = + IsColMajor ? 2 * params.inner_dim_count : 2 * NumDims - 2 * params.inner_dim_count - 1; + const int broadcast_bcast_dim = + IsColMajor ? 
2 * params.inner_dim_count + 1 : 2 * NumDims - 2 * params.inner_dim_count - 2; + + if (first_multiple > bcast_dim_left_index) { + const Index head_size = first_multiple - bcast_dim_left_index; + params.input_block_sizes[params.bcast_dim] = head_size; + params.bcast_block_sizes[copy_bcast_dim] = head_size; + params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = 1; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim]; + + num_output_coeffs += + BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + } + if (first_multiple < last_multiple) { + params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size; + params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size; + params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = (last_multiple - first_multiple) / input_bcast_dim_size; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim]; + const Index offset = (first_multiple - bcast_dim_left_index) * m_outputStrides[params.bcast_dim]; + + num_output_coeffs += + BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, offset, scratch, + materialized_output, materialized_input, materialized_input_size); + } + if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) { + const Index tail_size = bcast_dim_left_index + params.bcast_dim_size - last_multiple; + params.input_block_sizes[params.bcast_dim] = tail_size; + params.bcast_block_sizes[copy_bcast_dim] = tail_size; + params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = 1; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim]; + const Index offset = (last_multiple - bcast_dim_left_index) * m_outputStrides[params.bcast_dim]; + + num_output_coeffs += + BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, offset, scratch, + materialized_output, materialized_input, materialized_input_size); + } + } else { + // b and c do not exist. + const int copy_bcast_dim = + IsColMajor ? 
2 * params.inner_dim_count : 2 * NumDims - 2 * params.inner_dim_count - 1; + params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size; + params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size; + params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim]; + + num_output_coeffs += + BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + } + + return num_output_coeffs; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock( + const Dimensions& input_block_sizes, const Dimensions& input_block_strides, + const BroadcastDimensions& bcast_block_sizes, const BroadcastDimensions& bcast_block_strides, + const BroadcastDimensions& bcast_input_strides, Index bcast_offset, Index offset, TensorBlockScratch& scratch, + ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, size_t* materialized_input_size) const { + // ---------------------------------------------------------------------- // + // Tensor block descriptor for reading block from the input. + const Index input_offset = bcast_offset + offset; + TensorBlockDesc input_desc(IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset), + input_block_sizes); + + ArgTensorBlock input_block = m_impl.block(input_desc, scratch); + + // ---------------------------------------------------------------------- // + // Materialize input block into a temporary memory buffer only if it's not + // already available in the arg block. + const ScalarNoConst* input_buffer = NULL; + + if (input_block.data() != NULL) { + // Input block already has raw data, there is no need to materialize it. + input_buffer = input_block.data(); + + } else { + // Otherwise we have to do block assignment into a temporary buffer. + + // Maybe reuse previously allocated buffer, or allocate a new one with a + // scratch allocator. + const size_t input_total_size = input_block_sizes.TotalSize(); + if (*materialized_input == NULL || *materialized_input_size < input_total_size) { + *materialized_input_size = input_total_size; + void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar)); + *materialized_input = static_cast(mem); + } + + typedef internal::TensorBlockAssignment + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target(input_block_sizes, input_block_strides, *materialized_input), + input_block.expr()); + + input_buffer = *materialized_input; + } + + // ---------------------------------------------------------------------- // + // Copy data from materialized input block to the materialized output, using + // given broadcast strides (strides with zeroes). 
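+    // [Illustration, not part of the upstream file] The zero strides in
+    // `bcast_input_strides` make the copy below re-read the same input span
+    // for every broadcast repetition. The same idea with plain loops, for
+    // dims [d, b] and input strides {1, 0}:
+    //
+    //   for (Index j = 0; j < b; ++j)    // broadcast dim, input stride 0
+    //     for (Index i = 0; i < d; ++i)  // copy dim, input stride 1
+    //       dst[j * d + i] = src[i * 1 + j * 0];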
+ typedef internal::TensorBlockIO TensorBlockIO; + + typename TensorBlockIO::Src src(bcast_input_strides, input_buffer); + typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides, materialized_output + offset); + + return TensorBlockIO::Copy(dst, src); + } + + protected: + const Device EIGEN_DEVICE_REF m_device; + const std::remove_reference_t m_broadcast; + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h new file mode 100644 index 00000000000..5db563d601b --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -0,0 +1,474 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorKChippingReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor. + * + * + */ + +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions - 1; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorChippingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorChippingOp type; +}; + +template +struct DimensionId { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { + EIGEN_UNUSED_VARIABLE(dim); + eigen_assert(dim == DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { return DimId; } +}; +template <> +struct DimensionId { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) { eigen_assert(dim >= 0); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { return actual_dim; } + + private: + const DenseIndex actual_dim; +}; + +} // end namespace internal + +template +class TensorChippingOp : public TensorBase > { + public: + typedef TensorBase > Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) + : m_xpr(expr), m_offset(offset), 
m_dim(dim) { + eigen_assert(dim < XprType::NumDimensions && dim >= 0 && "Chip_Dim_out_of_range"); + } + + EIGEN_DEVICE_FUNC const Index offset() const { return m_offset; } + EIGEN_DEVICE_FUNC const Index dim() const { return m_dim.actualDim(); } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorChippingOp) + + protected: + typename XprType::Nested m_xpr; + const Index m_offset; + const internal::DimensionId m_dim; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorChippingOp XprType; + static constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + static constexpr int NumDims = NumInputDims - 1; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + static constexpr int Layout = TensorEvaluator::Layout; + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets. + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + // Chipping of outer-most dimension is a trivial operation, because we can + // read and write directly from the underlying tensor using single offset. + IsOuterChipping = (Layout == ColMajor && DimId == NumInputDims - 1) || (Layout == RowMajor && DimId == 0), + // Chipping inner-most dimension. + IsInnerChipping = (Layout == ColMajor && DimId == 0) || (Layout == RowMajor && DimId == NumInputDims - 1), + // Prefer block access if the underlying expression prefers it, otherwise + // only if chipping is not trivial. 
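+    // [Illustration, not part of the upstream file] Assuming a ColMajor tensor
+    // `t` of size [4, 5, 6]:
+    //
+    //   t.chip(2, 2);  // outer chip: 20 contiguous coefficients at offset 40
+    //   t.chip(1, 0);  // inner chip: 30 coefficients spaced 4 elements apart
+    //
+    // Only the outer chip can be read through a single base offset, which is
+    // why trivial (outer) chipping gains nothing from block access.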
+ PreferBlockAccess = TensorEvaluator::PreferBlockAccess || !IsOuterChipping, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef internal::TensorBlockDescriptor ArgTensorBlockDesc; + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { + EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(NumInputDims > m_dim.actualDim()); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + eigen_assert(op.offset() < input_dims[m_dim.actualDim()]); + + int j = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (i != m_dim.actualDim()) { + m_dimensions[j] = input_dims[i]; + ++j; + } + } + + m_stride = 1; + m_inputStride = 1; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < m_dim.actualDim(); ++i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } else { + for (int i = NumInputDims - 1; i > m_dim.actualDim(); --i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } + m_inputStride *= input_dims[m_dim.actualDim()]; + m_inputOffset = m_stride * op.offset(); + + // Check if chipping is effectively inner or outer: products of dimensions + // before or after the chipped dimension is `1`. + Index after_chipped_dim_product = 1; + for (int i = static_cast(m_dim.actualDim()) + 1; i < NumInputDims; ++i) { + after_chipped_dim_product *= input_dims[i]; + } + + Index before_chipped_dim_product = 1; + for (int i = 0; i < m_dim.actualDim(); ++i) { + before_chipped_dim_product *= input_dims[i]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_isEffectivelyInnerChipping = before_chipped_dim_product == 1; + m_isEffectivelyOuterChipping = after_chipped_dim_product == 1; + } else { + m_isEffectivelyInnerChipping = after_chipped_dim_product == 1; + m_isEffectivelyOuterChipping = before_chipped_dim_product == 1; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType /*data*/, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_impl.coeff(srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + if (isInnerChipping()) { + // m_stride is equal to 1, so let's avoid the integer division. 
+ eigen_assert(m_stride == 1); + Index inputIndex = index * m_inputStride + m_inputOffset; + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = m_impl.coeff(inputIndex); + inputIndex += m_inputStride; + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } else if (isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer division. + eigen_assert(m_stride > index); + return m_impl.template packet(index + m_inputOffset); + } else { + const Index idx = index / m_stride; + const Index rem = index - idx * m_stride; + if (rem + PacketSize <= m_stride) { + Index inputIndex = idx * m_inputStride + m_inputOffset + rem; + return m_impl.template packet(inputIndex); + } else { + // Cross the stride boundary. Fallback to slow path. + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index); + ++index; + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double cost = 0; + if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims - 1)) { + cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + cost += TensorOpCost::AddCost(); + } else { + cost += 3 * TensorOpCost::MulCost() + TensorOpCost::DivCost() + 3 * TensorOpCost::AddCost(); + } + + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + internal::TensorBlockResourceRequirements::skewed(target_size), m_impl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool root_of_expr_ast = false) const { + const Index chip_dim = m_dim.actualDim(); + + DSizes input_block_dims; + for (int i = 0; i < NumInputDims; ++i) { + input_block_dims[i] = i < chip_dim ? desc.dimension(i) : i > chip_dim ? desc.dimension(i - 1) : 1; + } + + ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims); + + // Try to reuse destination buffer for materializing argument block. + if (desc.HasDestinationBuffer()) { + DSizes arg_destination_strides; + for (int i = 0; i < NumInputDims; ++i) { + arg_destination_strides[i] = i < chip_dim ? desc.destination().strides()[i] + : i > chip_dim ? desc.destination().strides()[i - 1] + : 0; // for dimensions of size `1` stride should never be used. + } + + arg_desc.template AddDestinationBuffer(desc.destination().template data(), + arg_destination_strides); + } + + ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); + + if (arg_block.data() != NULL) { + // Forward argument block buffer if possible. + return TensorBlock(arg_block.kind(), arg_block.data(), desc.dimensions()); + + } else { + // Assign argument block expression to a buffer. 
+ + // Prepare storage for the materialized chipping result. + const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + + typedef internal::TensorBlockAssignment + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target(arg_desc.dimensions(), internal::strides(arg_desc.dimensions()), + block_storage.data()), + arg_block.expr()); + + return block_storage.AsTensorMaterializedBlock(); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); + if (isOuterChipping() && result) { + return result + m_inputOffset; + } else { + return NULL; + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex; + if (isInnerChipping()) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + inputIndex = index * m_inputStride + m_inputOffset; + } else if (isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer + // division. + eigen_assert(m_stride > index); + inputIndex = index + m_inputOffset; + } else { + const Index idx = index / m_stride; + inputIndex = idx * m_inputStride + m_inputOffset; + index -= idx * m_stride; + inputIndex += index; + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isInnerChipping() const { + return IsInnerChipping || m_isEffectivelyInnerChipping; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isOuterChipping() const { + return IsOuterChipping || m_isEffectivelyOuterChipping; + } + + Dimensions m_dimensions; + Index m_stride; + Index m_inputOffset; + Index m_inputStride; + TensorEvaluator m_impl; + const internal::DimensionId m_dim; + const Device EIGEN_DEVICE_REF m_device; + + // If product of all dimensions after or before the chipped dimension is `1`, + // it is effectively the same as chipping innermost or outermost dimension. + bool m_isEffectivelyInnerChipping; + bool m_isEffectivelyOuterChipping; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorChippingOp XprType; + static constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + static constexpr int NumDims = NumInputDims - 1; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + if (this->isInnerChipping()) { + // m_stride is equal to 1, so let's avoid the integer division. 
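+      // [Illustration, not part of the upstream file] A packet written through
+      // an inner chip is scattered with stride m_inputStride, e.g. assuming
+      //
+      //   Eigen::Tensor<float, 2> t(4, 5);
+      //   t.chip(0, 0).setZero();  // writes t(0, j) for j = 0..4, 4 apart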
+ eigen_assert(this->m_stride == 1); + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + internal::pstore(values, x); + Index inputIndex = index * this->m_inputStride + this->m_inputOffset; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->m_impl.coeffRef(inputIndex) = values[i]; + inputIndex += this->m_inputStride; + } + } else if (this->isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer division. + eigen_assert(this->m_stride > index); + this->m_impl.template writePacket(index + this->m_inputOffset, x); + } else { + const Index idx = index / this->m_stride; + const Index rem = index - idx * this->m_stride; + if (rem + PacketSize <= this->m_stride) { + const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; + this->m_impl.template writePacket(inputIndex, x); + } else { + // Cross stride boundary. Fallback to slow path. + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + internal::pstore(values, x); + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index) = values[i]; + ++index; + } + } + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) { + eigen_assert(this->m_impl.data() != NULL); + + const Index chip_dim = this->m_dim.actualDim(); + + DSizes input_block_dims; + for (int i = 0; i < NumInputDims; ++i) { + input_block_dims[i] = i < chip_dim ? desc.dimension(i) : i > chip_dim ? desc.dimension(i - 1) : 1; + } + + typedef TensorReshapingOp, const typename TensorBlock::XprType> TensorBlockExpr; + + typedef internal::TensorBlockAssignment TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target(input_block_dims, internal::strides(this->m_impl.dimensions()), + this->m_impl.data(), this->srcCoeff(desc.offset())), + block.expr().reshape(input_block_dims)); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h new file mode 100644 index 00000000000..e9c130dceb2 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -0,0 +1,354 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorConcatenationOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor concatenation class. + * + * + */ +namespace internal { +template +struct traits > { + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
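+  // [Illustration, not part of the upstream file] Concatenation joins two
+  // expressions along one axis; every other dimension must match:
+  //
+  //   Eigen::Tensor<float, 2> a(2, 3), b(2, 4);
+  //   Eigen::Tensor<float, 2> c = a.concatenate(b, 1);  // size [2, 7]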
+ typedef typename promote_storage_type::ret Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef + typename promote_index_type::Index, typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = traits::Layout; + enum { Flags = 0 }; + typedef std::conditional_t::val, + typename traits::PointerType, typename traits::PointerType> + PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorConcatenationOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorConcatenationOp type; +}; + +} // end namespace internal + +template +class TensorConcatenationOp : public TensorBase, WriteAccessors> { + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::nested::type Nested; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& lhsExpression() const { + return m_lhs_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& rhsExpression() const { + return m_rhs_xpr; + } + + EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorConcatenationOp) + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Axis m_axis; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorConcatenationOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + static constexpr int RightNumDims = + internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = + TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout) || + NumDims == 1), + YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE); + 
EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(0 <= m_axis && m_axis < NumDims); + const Dimensions& lhs_dims = m_leftImpl.dimensions(); + const Dimensions& rhs_dims = m_rightImpl.dimensions(); + { + int i = 0; + for (; i < m_axis; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + eigen_assert(lhs_dims[i] > 0); // Now i == m_axis. + eigen_assert(rhs_dims[i] > 0); + m_dimensions[i] = lhs_dims[i] + rhs_dims[i]; + for (++i; i < NumDims; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_leftStrides[0] = 1; + m_rightStrides[0] = 1; + m_outputStrides[0] = 1; + + for (int j = 1; j < NumDims; ++j) { + m_leftStrides[j] = m_leftStrides[j - 1] * lhs_dims[j - 1]; + m_rightStrides[j] = m_rightStrides[j - 1] * rhs_dims[j - 1]; + m_outputStrides[j] = m_outputStrides[j - 1] * m_dimensions[j - 1]; + } + } else { + m_leftStrides[NumDims - 1] = 1; + m_rightStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + + for (int j = NumDims - 2; j >= 0; --j) { + m_leftStrides[j] = m_leftStrides[j + 1] * lhs_dims[j + 1]; + m_rightStrides[j] = m_rightStrides[j + 1] * rhs_dims[j + 1]; + m_outputStrides[j] = m_outputStrides[j + 1] * m_dimensions[j + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear? + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow. + // See CL/76180724 comments for more ideas. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + // Collect dimension-wise indices (subs). 
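+    // [Illustration, not part of the upstream file] Worked example, ColMajor,
+    // lhs [2, 3] and rhs [2, 4] concatenated on axis 1 (output [2, 7]):
+    // linear index 9 decomposes into subs = {9 % 2, 9 / 2} = {1, 4}; since
+    // subs[1] >= 3, the coefficient comes from the rhs at {1, 4 - 3} = {1, 1}.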
+ array subs; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[NumDims - 1] = index; + } + + const Dimensions& left_dims = m_leftImpl.dimensions(); + if (subs[m_axis] < left_dims[m_axis]) { + Index left_index; + if (static_cast(Layout) == static_cast(ColMajor)) { + left_index = subs[0]; + EIGEN_UNROLL_LOOP + for (int i = 1; i < NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } else { + left_index = subs[NumDims - 1]; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i >= 0; --i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } + return m_leftImpl.coeff(left_index); + } else { + subs[m_axis] -= left_dims[m_axis]; + const Dimensions& right_dims = m_rightImpl.dimensions(); + Index right_index; + if (static_cast(Layout) == static_cast(ColMajor)) { + right_index = subs[0]; + EIGEN_UNROLL_LOOP + for (int i = 1; i < NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } else { + right_index = subs[NumDims - 1]; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i >= 0; --i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } + return m_rightImpl.coeff(right_index); + } + } + + // TODO(phli): Add a real vectorization. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + const int packetSize = PacketType::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost() + TensorOpCost::ModCost()); + const double lhs_size = m_leftImpl.dimensions().TotalSize(); + const double rhs_size = m_rightImpl.dimensions().TotalSize(); + return (lhs_size / (lhs_size + rhs_size)) * m_leftImpl.costPerCoeff(vectorized) + + (rhs_size / (lhs_size + rhs_size)) * m_rightImpl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_leftStrides; + array m_rightStrides; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; + const Axis m_axis; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorConcatenationOp XprType; + typedef typename Base::Dimensions Dimensions; + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = + TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef 
internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) : Base(op, device) { + EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + // Collect dimension-wise indices (subs). + array subs; + for (int i = Base::NumDims - 1; i > 0; --i) { + subs[i] = index / this->m_outputStrides[i]; + index -= subs[i] * this->m_outputStrides[i]; + } + subs[0] = index; + + const Dimensions& left_dims = this->m_leftImpl.dimensions(); + if (subs[this->m_axis] < left_dims[this->m_axis]) { + Index left_index = subs[0]; + for (int i = 1; i < Base::NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i]; + } + return this->m_leftImpl.coeffRef(left_index); + } else { + subs[this->m_axis] -= left_dims[this->m_axis]; + const Dimensions& right_dims = this->m_rightImpl.dimensions(); + Index right_index = subs[0]; + for (int i = 1; i < Base::NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i]; + } + return this->m_rightImpl.coeffRef(right_index); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + const int packetSize = PacketType::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize()); + + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + internal::pstore(values, x); + for (int i = 0; i < packetSize; ++i) { + coeffRef(index + i) = values[i]; + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h new file mode 100644 index 00000000000..6f1c9744ee0 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -0,0 +1,962 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorContraction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor contraction class. + * + * + */ +namespace internal { + +template +struct traits> { + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
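+  // [Illustration, not part of the upstream file] Contraction generalizes
+  // matrix multiplication; contracting dimension 1 of `a` against dimension 0
+  // of `b` is an ordinary matrix product:
+  //
+  //   Eigen::Tensor<float, 2> a(2, 3), b(3, 4);
+  //   Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
+  //   Eigen::Tensor<float, 2> c = a.contract(b, dims);  // size [2, 4]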
+ typedef typename gebp_traits, + std::remove_const_t>::ResScalar Scalar; + + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef + typename promote_index_type::Index, typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; + + // From NumDims below. + static constexpr int NumDimensions = + traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; + static constexpr int Layout = traits::Layout; + typedef std::conditional_t::val, + typename traits::PointerType, typename traits::PointerType> + PointerType; + + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorContractionOp& type; +}; + +template +struct nested, 1, + typename eval>::type> { + typedef TensorContractionOp type; +}; + +template +struct traits< + TensorEvaluator, Device_>> { + typedef Indices_ Indices; + typedef LeftArgType_ LeftArgType; + typedef RightArgType_ RightArgType; + typedef OutputKernelType_ OutputKernelType; + typedef Device_ Device; + + // From NumDims below. + static constexpr int NumDimensions = + traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; +}; + +// Helper class to allocate and deallocate temporary memory for packed buffers. +template +struct TensorContractionBlockMemAllocator { + typedef void* BlockMemHandle; + + template + EIGEN_DEVICE_FUNC static BlockMemHandle allocate(Device& d, const Index bm, const Index bk, const Index bn, + LhsScalar** lhs_block, RhsScalar** rhs_block) { + eigen_assert(lhs_block); + eigen_assert(rhs_block); + BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); + char* block_mem = static_cast(d.allocate(sz.lhs_size + sz.rhs_size)); + *lhs_block = static_cast(static_cast(block_mem)); + *rhs_block = static_cast(static_cast(block_mem + sz.lhs_size)); + return block_mem; + } + + template + EIGEN_DEVICE_FUNC static BlockMemHandle allocateSlices(Device& d, const Index bm, const Index bk, const Index bn, + const Index num_lhs, const Index num_rhs, + const Index num_slices, std::vector* lhs_blocks, + std::vector* rhs_blocks) { + eigen_assert(num_slices > 0); + eigen_assert(num_lhs >= 0 && num_rhs >= 0); + eigen_assert(num_lhs == 0 || lhs_blocks); + eigen_assert(num_rhs == 0 || rhs_blocks); + BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); + void* block_mem = d.allocate((num_lhs * sz.lhs_size + num_rhs * sz.rhs_size) * num_slices); + eigen_assert(block_mem); + char* mem = static_cast(block_mem); + + for (Index x = 0; x < num_slices; x++) { + if (num_lhs > 0) lhs_blocks[x].resize(num_lhs); + for (Index m = 0; m < num_lhs; m++) { + lhs_blocks[x][m] = static_cast(static_cast(mem)); + mem += sz.lhs_size; + } + if (num_rhs > 0) rhs_blocks[x].resize(num_rhs); + for (Index n = 0; n < num_rhs; n++) { + rhs_blocks[x][n] = static_cast(static_cast(mem)); + mem += sz.rhs_size; + } + } + + return block_mem; + } + + template + EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { + d.deallocate(handle); + } + + private: + struct BlockSizes { + Index lhs_size; + Index rhs_size; + }; + EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm, const Index bk, const Index bn) { + Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + BlockSizes sz; + sz.lhs_size = numext::div_ceil(bm * bk * sizeof(LhsScalar), align) * align; + sz.rhs_size = numext::div_ceil(bn * bk 
* sizeof(RhsScalar), align) * align;
+    return sz;
+  }
+};
+
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. Default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlockPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low level libraries to
+// perform matrix multiplication, and still rely on the Eigen contraction
+// evaluator. This also includes full support in TensorContractionThreadPool,
+// assuming that the underlying gemm does not use its own threading.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
+//   multiplication, lhs tensor and rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice almost
+//   always is Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+//   practice it's always column major blas_data_mapper (it must be of
+//   ResScalar type).
+//
+// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two
+//   dimensional view into the Lhs/Rhs tensor expressions. In practice it's
+//   TensorContractionInputMapper, or some specialization of it based on the
+//   type of tensor expression (e.g. TensorImagePatchOp has optimized input
+//   mapper).
+template <typename ResScalar, typename LhsScalar, typename RhsScalar, typename StorageIndex, typename OutputMapper,
+          typename LhsMapper, typename RhsMapper>
+struct TensorContractionKernel {
+  // True if `invoke()` supports `beta` in `C <- alpha * A * B + beta * C`
+  // (otherwise beta should be always equal to 1).
+  enum { HasBeta = false };
+
+  EIGEN_DEVICE_FUNC TensorContractionKernel(StorageIndex m_, StorageIndex k_, StorageIndex n_, StorageIndex bm_,
+                                            StorageIndex bk_, StorageIndex bn_)
+      : m(m_), k(k_), n(n_), bm(bm_), bk(bk_), bn(bn_) {}
+
+  // Pack blocks of Lhs and Rhs into contiguous blocks in memory.
+  typedef LhsScalar* LhsBlock;
+  typedef RhsScalar* RhsBlock;
+
+  // Packed Lhs/Rhs block memory allocator.
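+  // [Illustration, not part of the upstream file] One possible loop nest that
+  // composes packLhs/packRhs/invoke into a blocked GEMM over an [m x k] lhs
+  // and a [k x n] rhs (the real traversal order is chosen by the evaluator):
+  //
+  //   for (StorageIndex i = 0; i < m; i += bm)
+  //     for (StorageIndex d = 0; d < k; d += bk) {
+  //       packLhs(&lhs_block, lhs.getSubMapper(i, d), bk, bm);
+  //       for (StorageIndex j = 0; j < n; j += bn) {
+  //         packRhs(&rhs_block, rhs.getSubMapper(d, j), bk, bn);
+  //         invoke(output.getSubMapper(i, j), lhs_block, rhs_block, bm, bk, bn,
+  //                alpha, /*beta=*/ResScalar(1));
+  //       }
+  //     }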
+ typedef TensorContractionBlockMemAllocator BlockMemAllocator; + typedef typename BlockMemAllocator::BlockMemHandle BlockMemHandle; + + typedef typename internal::gebp_traits Traits; + + typedef internal::gemm_pack_lhs + LhsPacker; + + typedef internal::gemm_pack_rhs + RhsPacker; + + typedef internal::gebp_kernel + GebpKernel; + + template + EIGEN_DEVICE_FUNC BlockMemHandle allocate(Device& d, LhsBlock* lhs_block, RhsBlock* rhs_block) { + return BlockMemAllocator::allocate(d, bm, bk, bn, lhs_block, rhs_block); + } + + template + EIGEN_DEVICE_FUNC BlockMemHandle allocateSlices(Device& d, const StorageIndex num_lhs, const StorageIndex num_rhs, + const StorageIndex num_slices, std::vector* lhs_blocks, + std::vector* rhs_blocks) { + return BlockMemAllocator::allocateSlices(d, bm, bk, bn, num_lhs, num_rhs, num_slices, lhs_blocks, rhs_blocks); + } + + template + EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { + BlockMemAllocator::deallocate(d, handle); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packLhs(LhsBlock* lhsBlock, const typename LhsMapper::SubMapper& data_mapper, + const StorageIndex depth, const StorageIndex rows) { + LhsPacker()(*lhsBlock, data_mapper, depth, rows, /*stride*/ 0, + /*offset*/ 0); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packRhs(RhsBlock* rhsBlock, const typename RhsMapper::SubMapper& data_mapper, + const StorageIndex depth, const StorageIndex cols) { + RhsPacker()(*rhsBlock, data_mapper, depth, cols); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void invoke(const OutputMapper& output_mapper, const LhsBlock& lhsBlock, + const RhsBlock& rhsBlock, const StorageIndex rows, + const StorageIndex depth, const StorageIndex cols, + const ResScalar alpha, const ResScalar beta) { + // Default GEBP kernel does not support beta. + eigen_assert(beta == ResScalar(1)); + static const int kComputeStrideFromBlockDimensions = -1; + GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha, + /*strideA*/ kComputeStrideFromBlockDimensions, + /*strideB*/ kComputeStrideFromBlockDimensions, + /*offsetA*/ 0, /*offsetB*/ 0); + } + + private: + // These are dimensions of the original Tensors, and selected block sizes. The + // actual block sizes passed to all function above might be smaller because of + // the partial blocks at the end. + const StorageIndex m; + const StorageIndex k; + const StorageIndex n; + const StorageIndex bm; + const StorageIndex bk; + const StorageIndex bn; +}; + +} // end namespace internal + +// Tensor contraction params that should enable to get from output matrix +// 2-dimensional coordinates to the output tensor dimensions. +struct TensorContractionParams { + // TensorContraction evaluator assumes that both tensors are in ColMajor + // layout, if tensors are in RowMajor evaluator swap lhs with rhs. + bool swapped_arguments; +}; + +// Output kernel allows to fuse operations into the tensor contraction. +// +// Examples: +// 1. Elementwise Relu transformation following Conv2D. +// 2. AddBias to the Conv2D output channels dimension. +// +// The NoOpOutputKernel implements an output kernel that does absolutely nothing. +struct NoOpOutputKernel { + /** + * Tensor contraction evaluator calls this kernel after finishing each block + * of output matrix. Output blocks belong to the 2-dimensional output tensor. + * + * TensorContractionParams contains contraction dimensions information + * required to map output 2-d space into the expected output tensor space + * (potentially higher dimensional). 
+ * + * \param[in] output_mapper Access to output tensor memory + * \param[in] params Tensor contraction parameters + * \param[in] i Index of a first row available through output_mapper + * \param[in] j Index of a first column available through output_mapper + * \param[in] num_rows Number of available rows + * \param[in] num_cols Number of available columns + */ + template + EIGEN_ALWAYS_INLINE void operator()(const internal::blas_data_mapper& output_mapper, + const TensorContractionParams& params, Index i, Index j, Index num_rows, + Index num_cols) const { + EIGEN_UNUSED_VARIABLE(output_mapper); + EIGEN_UNUSED_VARIABLE(params); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j); + EIGEN_UNUSED_VARIABLE(num_rows); + EIGEN_UNUSED_VARIABLE(num_cols); + } +}; + +template +class TensorContractionOp + : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename internal::gebp_traits::ResScalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(const LhsXprType& lhs, const RhsXprType& rhs, + const Indices& dims, + const OutputKernelType& output_kernel = OutputKernelType()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims), m_output_kernel(output_kernel) {} + + EIGEN_DEVICE_FUNC const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC const internal::remove_all_t& lhsExpression() const { + return m_lhs_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& rhsExpression() const { + return m_rhs_xpr; + } + + EIGEN_DEVICE_FUNC const OutputKernelType& outputKernel() const { return m_output_kernel; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Indices m_indices; + const OutputKernelType m_output_kernel; +}; + +template +struct TensorContractionEvaluatorBase { + typedef typename internal::traits::Indices Indices; + typedef typename internal::traits::LeftArgType LeftArgType; + typedef typename internal::traits::RightArgType RightArgType; + typedef typename internal::traits::OutputKernelType OutputKernelType; + typedef typename internal::traits::Device Device; + + typedef TensorContractionOp XprType; + typedef std::remove_const_t Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = true, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = true + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
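+  // [Illustration, not part of the upstream file] The swap is sound because a
+  // RowMajor matrix reinterpreted as ColMajor is its transpose, and
+  //
+  //   C = A * B  <=>  C^T = B^T * A^T,
+  //
+  // so exchanging the operands while flipping the layout yields the correct
+  // result without moving any data.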
+ typedef std::conditional_t(Layout) == static_cast(ColMajor), LeftArgType, RightArgType> + EvalLeftArgType; + typedef std::conditional_t(Layout) == static_cast(ColMajor), RightArgType, LeftArgType> + EvalRightArgType; + + typedef TensorEvaluator LeftEvaluatorType; + typedef TensorEvaluator RightEvaluatorType; + + static constexpr int LDims = + internal::array_size::Dimensions>::value; + static constexpr int RDims = + internal::array_size::Dimensions>::value; + static constexpr int ContractDims = internal::array_size::value; + static constexpr int NumDims = LDims + RDims - 2 * ContractDims; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + typedef DSizes Dimensions; + + EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.lhsExpression(), + op.rhsExpression()), + device), + m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.rhsExpression(), + op.lhsExpression()), + device), + m_device(device), + m_output_kernel(op.outputKernel()), + m_result(NULL) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + DSizes eval_left_dims; + DSizes eval_right_dims; + array, ContractDims> eval_op_indices; + if (static_cast(Layout) == static_cast(ColMajor)) { + // For ColMajor, we keep using the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[i]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[i]; + } + // We keep the pairs of contracting indices. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = op.indices()[i].first; + eval_op_indices[i].second = op.indices()[i].second; + } + } else { + // For RowMajor, we need to reverse the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; + } + // We need to flip all the pairs of contracting indices as well as + // reversing the dimensions. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first; + } + } + + // Check for duplicate axes and make sure the first index in eval_op_indices + // is increasing. 
Using O(n^2) sorting is OK since ContractDims is small + for (int i = 0; i < ContractDims; i++) { + for (int j = i + 1; j < ContractDims; j++) { + eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first && + eval_op_indices[j].second != eval_op_indices[i].second && "contraction axes should be unique"); + if (eval_op_indices[j].first < eval_op_indices[i].first) { + numext::swap(eval_op_indices[j], eval_op_indices[i]); + } + } + } + + array lhs_strides; + lhs_strides[0] = 1; + for (int i = 0; i < LDims - 1; ++i) { + lhs_strides[i + 1] = lhs_strides[i] * eval_left_dims[i]; + } + + array rhs_strides; + rhs_strides[0] = 1; + for (int i = 0; i < RDims - 1; ++i) { + rhs_strides[i + 1] = rhs_strides[i] * eval_right_dims[i]; + } + + if (m_i_strides.size() > 0) m_i_strides[0] = 1; + if (m_j_strides.size() > 0) m_j_strides[0] = 1; + if (m_k_strides.size() > 0) m_k_strides[0] = 1; + + m_i_size = 1; + m_j_size = 1; + m_k_size = 1; + + // To compute the dimension, we simply concatenate the non-contracting + // dimensions of the left and then the right tensor. Additionally, we also + // compute the strides corresponding to the left non-contracting + // dimensions and right non-contracting dimensions. + m_lhs_inner_dim_contiguous = true; + int dim_idx = 0; + Index nocontract_idx = 0; + + for (int i = 0; i < LDims; i++) { + // find if we are contracting on index i of left tensor + bool contracting = false; + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].first == i) { + contracting = true; + break; + } + } + if (!contracting) { + // add dimension size to output dimensions + m_dimensions[dim_idx] = eval_left_dims[i]; + m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; + if (dim_idx != i) { + m_lhs_inner_dim_contiguous = false; + } + if (nocontract_idx + 1 < internal::array_size::value) { + m_i_strides[nocontract_idx + 1] = m_i_strides[nocontract_idx] * eval_left_dims[i]; + } else { + m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; + } + dim_idx++; + nocontract_idx++; + } + } + + nocontract_idx = 0; + for (int i = 0; i < RDims; i++) { + bool contracting = false; + // find if we are contracting on index i of right tensor + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].second == i) { + contracting = true; + break; + } + } + if (!contracting) { + m_dimensions[dim_idx] = eval_right_dims[i]; + if (nocontract_idx + 1 < internal::array_size::value) { + m_j_strides[nocontract_idx + 1] = m_j_strides[nocontract_idx] * eval_right_dims[i]; + } else { + m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; + } + m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; + dim_idx++; + nocontract_idx++; + } + } + + // Now compute the strides corresponding to the contracting dimensions. We + // assumed above that non-contracting axes are represented in the same order + // in the matrix as they are in the tensor. This is not the case for + // contracting axes. As the contracting axes must be of the same size in + // each tensor, we'll only look at the first tensor here. 
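+    // Illustrative note (not in the upstream source): for a ColMajor tensor
+    // with dimensions (d0, d1, d2) the strides built above are (1, d0, d0 * d1),
+    // i.e. element (i, j, k) lives at offset i + j * d0 + k * d0 * d1.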
+    m_rhs_inner_dim_contiguous = true;
+    m_rhs_inner_dim_reordered = false;
+    for (int i = 0; i < ContractDims; i++) {
+      Index left = eval_op_indices[i].first;
+      Index right = eval_op_indices[i].second;
+
+      Index size = eval_left_dims[left];
+      eigen_assert(size == eval_right_dims[right] && "Contraction axes must be same size");
+
+      if (i + 1 < static_cast<int>(internal::array_size<contract_t>::value)) {
+        m_k_strides[i + 1] = m_k_strides[i] * size;
+      } else {
+        m_k_size = m_k_strides[i] * size;
+      }
+      m_left_contracting_strides[i] = lhs_strides[left];
+      m_right_contracting_strides[i] = rhs_strides[right];
+
+      if (i > 0 && right < eval_op_indices[i - 1].second) {
+        m_rhs_inner_dim_reordered = true;
+      }
+      if (right != i) {
+        m_rhs_inner_dim_contiguous = false;
+      }
+    }
+
+    // If the layout is RowMajor, we need to reverse the m_dimensions
+    if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
+      for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
+        numext::swap(m_dimensions[i], m_dimensions[j]);
+      }
+    }
+
+    // A set of parameters that will allow the output kernel to get from the
+    // output tensor dimensions (i, j) into the original tensor dimensions.
+    // TODO(ezhulenev): Add parameters required to infer output tensor index for
+    // more complex contractions than 2x2 on internal dimension.
+    m_tensor_contraction_params.swapped_arguments = static_cast<int>(Layout) == RowMajor;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    m_leftImpl.evalSubExprsIfNeeded(NULL);
+    m_rightImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      evalTo(data);
+      return false;
+    } else {
+      m_result = static_cast<EvaluatorPointerType>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      evalTo(m_result);
+      return true;
+    }
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
+      m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
+        if (dest) {
+          evalToAsync(dest, [done]() { done(false); });
+        } else {
+          m_result = static_cast<EvaluatorPointerType>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+          evalToAsync(m_result, [done]() { done(true); });
+        }
+      });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+#ifndef TENSOR_CONTRACTION_DISPATCH
+#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \
+  if (this->m_lhs_inner_dim_contiguous) {                    \
+    if (this->m_rhs_inner_dim_contiguous) {                  \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<true, true, true, ALIGNMENT> ARGS;            \
+      } else {                                               \
+        METHOD<true, true, false, ALIGNMENT> ARGS;           \
+      }                                                      \
+    } else {                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<true, false, true, ALIGNMENT> ARGS;           \
+      } else {                                               \
+        METHOD<true, false, false, ALIGNMENT> ARGS;          \
+      }                                                      \
+    }                                                        \
+  } else {                                                   \
+    if (this->m_rhs_inner_dim_contiguous) {                  \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<false, true, true, ALIGNMENT> ARGS;           \
+      } else {                                               \
+        METHOD<false, true, false, ALIGNMENT> ARGS;          \
+      }                                                      \
+    } else {                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<false, false, true, ALIGNMENT> ARGS;          \
+      } else {                                               \
+        METHOD<false, false, false, ALIGNMENT> ARGS;         \
+      }                                                      \
+    }                                                        \
+  }
+#endif
+
+#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH
+#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \
+  if (this->m_lhs_inner_dim_contiguous) {                                    \
+    if (this->m_rhs_inner_dim_contiguous) {                                  \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<true, true, true, ALIGNMENT> ARGS)->FN;                  \
+      } else {                                                               \
+        (new METHOD<true, true, false, ALIGNMENT> ARGS)->FN;                 \
+      }                                                                      \
+    } else {                                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<true, false, true, ALIGNMENT> ARGS)->FN;                 \
+      } else {                                                               \
+        (new METHOD<true, false, false, ALIGNMENT> ARGS)->FN;                \
+      }                                                                      \
+    }                                                                        \
+  } else {                                                                   \
+    if (this->m_rhs_inner_dim_contiguous) {                                  \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<false, true, true, ALIGNMENT> ARGS)->FN;                 \
+      } else {                                                               \
+        (new METHOD<false, true, false, ALIGNMENT> ARGS)->FN;                \
+      }                                                                      \
+    } else {                                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<false, false, true, ALIGNMENT> ARGS)->FN;                \
+      } else {                                                               \
+        (new METHOD<false, false, false, ALIGNMENT> ARGS)->FN;               \
+      }                                                                      \
+    }                                                                        \
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
+    static_cast<const Derived*>(this)->template evalProduct<Unaligned>(buffer);
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalToCallback>
+  void evalToAsync(Scalar* buffer, EvalToCallback done) const {
+    static_cast<const Derived*>(this)->template evalProductAsync<EvalToCallback, Unaligned>(buffer, std::move(done));
+  }
+#endif  // EIGEN_USE_THREADS
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalProductSequential(Scalar* buffer) const {
+    if (this->m_j_size == 1) {
+      this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(
+          buffer);
+    } else {
+      this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(
+          buffer);
+    }
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+#if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+#endif
+      void
+      evalGemv(Scalar* buffer) const {
+    const Index rows = m_i_size;
+    const Index cols = m_k_size;
+
+    typedef std::remove_const_t<typename EvalLeftArgType::Scalar> LhsScalar;
+    typedef std::remove_const_t<typename EvalRightArgType::Scalar> RhsScalar;
+    typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+    typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+    const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+    const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
+    const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned;
+    const int rhs_alignment = RightEvaluator::IsAligned ? Aligned : Unaligned;
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+                                                   contract_t, lhs_packet_size, lhs_inner_dim_contiguous, false,
+                                                   lhs_alignment>
+        LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, RightEvaluator,
+                                                   right_nocontract_t, contract_t, rhs_packet_size,
+                                                   rhs_inner_dim_contiguous, rhs_inner_dim_reordered, rhs_alignment>
+        RhsMapper;
+
+    LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
+    RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
+
+    const Scalar alpha(1);
+    const Index resIncr(1);
+
+    // zero out the result buffer (which must be of size at least rows * sizeof(Scalar))
+    m_device.fill(buffer, buffer + rows, Scalar(0));
+
+    internal::general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, false, RhsScalar, RhsMapper,
+                                            false>::run(rows, cols, lhs, rhs, buffer, resIncr, alpha);
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+    m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params, static_cast<Index>(0),
+                    static_cast<Index>(0), rows, static_cast<Index>(1));
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+#if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+#endif
+      void
+      evalGemm(Scalar* buffer) const {
+    // columns in left side, rows in right side
+    const Index k = this->m_k_size;
+    this->template evalGemmPartial<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
+                                   Alignment, /*use_output_kernel*/ true>(buffer, 0, k, 1);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel(Scalar* buffer, Index k_start, Index k_end,
+                                                            int num_threads) const {
+    evalGemmPartial<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment,
+                    /*use_output_kernel*/ false>(buffer, k_start, k_end, num_threads);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment,
+            bool use_output_kernel>
+  EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+    eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= this->m_k_size);
+    // columns in slice on left side, rows on right side
+    const Index k_slice = k_end - k_start;
+
+    // rows in left side
+    const Index m = this->m_i_size;
+
+    // columns in right side
+    const Index n = this->m_j_size;
+
+    // define data mappers for Lhs and Rhs
+    typedef std::remove_const_t<typename EvalLeftArgType::Scalar> LhsScalar;
+    typedef std::remove_const_t<typename EvalRightArgType::Scalar> RhsScalar;
+
+    typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+    typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+    const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+    const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
+
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+                                                   contract_t, lhs_packet_size, lhs_inner_dim_contiguous, false,
+                                                   Unaligned>
+        LhsMapper;
+
+    typedef
internal::TensorContractionInputMapper + RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + typedef internal::TensorContractionKernel + TensorContractionKernel; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + // Sizes of the blocks to load in cache. See the Goto paper for details. + internal::TensorContractionBlocking blocking( + k_slice, m, n, num_threads); + const Index kc = blocking.kc(); + const Index mc = numext::mini(m, blocking.mc()); + const Index nc = numext::mini(n, blocking.nc()); + + typedef typename TensorContractionKernel::LhsBlock LhsBlock; + typedef typename TensorContractionKernel::RhsBlock RhsBlock; + + LhsBlock blockA; + RhsBlock blockB; + + TensorContractionKernel kernel(m, k_slice, n, mc, kc, nc); + + typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; + const BlockMemHandle packed_mem = kernel.allocate(this->m_device, &blockA, &blockB); + + // If a contraction kernel does not support beta, explicitly initialize + // output buffer with zeroes. + if (!TensorContractionKernel::HasBeta) { + this->m_device.fill(buffer, buffer + m * n, Scalar(0)); + } + + for (Index i2 = 0; i2 < m; i2 += mc) { + const Index actual_mc = numext::mini(i2 + mc, m) - i2; + for (Index k2 = k_start; k2 < k_end; k2 += kc) { + // make sure we don't overshoot right edge of left matrix, then pack vertical panel + const Index actual_kc = numext::mini(k2 + kc, k_end) - k2; + kernel.packLhs(&blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); + + // If kernel supports beta, there is no need to initialize output + // buffer with zeroes. + const Scalar alpha = Scalar(1); + const Scalar beta = (TensorContractionKernel::HasBeta && k2 == k_start) ? Scalar(0) : Scalar(1); + + // series of horizontal blocks + for (Index j2 = 0; j2 < n; j2 += nc) { + // make sure we don't overshoot right edge of right matrix, then pack block + const Index actual_nc = numext::mini(j2 + nc, n) - j2; + kernel.packRhs(&blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc); + + // call gebp (matrix kernel) + // The parameters here are copied from Eigen's GEMM implementation + const OutputMapper output_mapper = output.getSubMapper(i2, j2); + kernel.invoke(output_mapper, blockA, blockB, actual_mc, actual_kc, actual_nc, alpha, beta); + + // We are done with this [i2, j2] output block. 
+ if (use_output_kernel && k2 + kc >= k_end) { + m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2, actual_mc, actual_nc); + } + } + } + } + + kernel.deallocate(this->m_device, packed_mem); + } + + EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_result[index]; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; } + + protected: + Dimensions m_dimensions; + + contract_t m_k_strides; + contract_t m_left_contracting_strides; + contract_t m_right_contracting_strides; + + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + left_nocontract_t m_i_strides; + right_nocontract_t m_j_strides; + left_nocontract_t m_left_nocontract_strides; + right_nocontract_t m_right_nocontract_strides; + + Index m_i_size; + Index m_j_size; + Index m_k_size; + + TensorContractionParams m_tensor_contraction_params; + + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; + const Device EIGEN_DEVICE_REF m_device; + OutputKernelType m_output_kernel; + EvaluatorPointerType m_result; +}; + +// evaluator for default device +template +struct TensorEvaluator, Device> + : public TensorContractionEvaluatorBase< + TensorEvaluator, Device>> { + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef std::remove_const_t Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + static constexpr int Layout = TensorEvaluator::Layout; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef std::conditional_t(ColMajor), LeftArgType, RightArgType> EvalLeftArgType; + typedef std::conditional_t(ColMajor), RightArgType, LeftArgType> EvalRightArgType; + + static constexpr int LDims = + internal::array_size::Dimensions>::value; + static constexpr int RDims = + internal::array_size::Dimensions>::value; + static constexpr int ContractDims = internal::array_size::value; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static constexpr int NumDims = LDims + RDims - 2 * ContractDims; + + // Could we use NumDimensions here? 
+ typedef DSizes Dimensions; + + TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + template + void evalProduct(Scalar* buffer) const { + TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer)); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h new file mode 100644 index 00000000000..7fbe30a9f9f --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -0,0 +1,69 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +enum { ShardByRow = 0, ShardByCol = 1 }; + +// Default Blocking Strategy +template +class TensorContractionBlocking { + public: + /* + adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h` + requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h` + which in turn, requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in + `GeneralBlockPanelKernel.h` which in turn, requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in + `GeneralBlockPanelKernel.h` (else HIPCC will error out) + + However adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h` + results in NVCC erroring out with the following error + + ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901: + dynamic initialization is not supported for function-scope static variables within a __device__/__global__ + function + */ + +#if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC +#endif + TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) + : kc_(k), mc_(m), nc_(n) { + if (ShardingType == ShardByCol) { + computeProductBlockingSizes(kc_, mc_, nc_, num_threads); + } else { + computeProductBlockingSizes(kc_, nc_, mc_, num_threads); + } + + const int rhs_packet_size = internal::packet_traits::size; + kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ? 
kc_ : (kc_ / rhs_packet_size) * rhs_packet_size; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; } + + private: + StorageIndex kc_; + StorageIndex mc_; + StorageIndex nc_; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h new file mode 100644 index 00000000000..dbea8aa92af --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -0,0 +1,7 @@ + +#if defined(__clang__) || defined(__GNUC__) +#warning \ + "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file" +#endif + +#include "TensorContractionGpu.h" diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h new file mode 100644 index 00000000000..780e8961eae --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -0,0 +1,1387 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014-2015 Benoit Steiner +// Copyright (C) 2015 Navdeep Jaitly +// Copyright (C) 2014 Eric Martin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H + +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +template +__device__ EIGEN_STRONG_INLINE void EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, Scalar* lhs_shmem, + Scalar* rhs_shmem, const Index m_size, + const Index n_size, const Index k_size) { + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in threadIdx.x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. 
This gives no bank conflicts
+  // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
+
+  // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
+  // conflicts on writes and also none on reads.
+
+  // storage indices
+  const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
+  const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
+
+  const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
+  const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
+  const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
+  const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
+  const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
+  const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
+  const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
+  const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
+
+  const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
+  const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
+  const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
+  const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
+  const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
+  const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
+  const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
+  const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
+
+  // in the loading code, the following variables are important:
+  // threadIdx.x: the vertical position in an 8x8 block
+  // threadIdx.y: the vertical index of the 8x8 block in the grid
+  // threadIdx.z: the horizontal position in an 8x8 block
+  // k: the horizontal index of the 8x8 block in the grid
+  //
+  // The k parameter is implicit (it was the loop counter for a loop that went
+  // from 0 to <8, but now that loop is unrolled in the code below).
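The 9-element row pitch above avoids conflicts because consecutive rows land 9 words apart and 9 is coprime to the bank count. A minimal host-side sketch of that arithmetic (illustrative only, not part of the patch; it assumes 32 four-byte shared-memory banks and the x-fastest lane layout of one warp):

#include <cstdio>

int main() {
  // One warp covers threadIdx.x in [0, 8) and threadIdx.y in [0, 4); the LHS
  // store offset is 72 * y + 9 * x words, so its bank is that offset mod 32.
  bool seen[32] = {};
  for (int y = 0; y < 4; ++y)
    for (int x = 0; x < 8; ++x) seen[(72 * y + 9 * x) % 32] = true;
  int distinct = 0;
  for (bool s : seen) distinct += s;
  std::printf("distinct banks: %d\n", distinct);  // prints 32: conflict-free
  return 0;
}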
+ + const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = conv(0); \ + lhs_pf1 = conv(0); \ + lhs_pf2 = conv(0); \ + lhs_pf3 = conv(0); \ + lhs_pf4 = conv(0); \ + lhs_pf5 = conv(0); \ + lhs_pf6 = conv(0); \ + lhs_pf7 = conv(0); \ + \ + rhs_pf0 = conv(0); \ + rhs_pf1 = conv(0); \ + rhs_pf2 = conv(0); \ + rhs_pf3 = conv(0); \ + rhs_pf4 = conv(0); \ + rhs_pf5 = conv(0); \ + rhs_pf6 = conv(0); \ + rhs_pf7 = conv(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ + \ + if (rhs_horiz_7 < n_size) { \ + rhs_pf0 = 
rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } + +#define writeRegToShmem() \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = conv(0); \ + Scalar res(i, 1) = conv(0); \ + Scalar res(i, 2) = conv(0); \ + Scalar res(i, 3) = conv(0); \ + Scalar res(i, 4) = conv(0); \ + Scalar res(i, 5) = conv(0); \ + Scalar res(i, 6) = conv(0); \ + Scalar res(i, 7) = conv(0); + + internal::scalar_cast_op conv; + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. 
Despite common sense,
+    // the code is a bit faster with this here than at the bottom of the loop
+    __syncthreads();
+
+    prefetchIntoRegisters(base_k);
+    writeRegToShmem();
+
+#undef prefetchIntoRegisters
+#undef writeRegToShmem
+
+    // wait for shared mem packing to be done before starting computation
+    __syncthreads();
+
+    // compute 8x8 matrix product by outer product. This involves packing one column
+    // of LHS and one row of RHS into registers (takes 16 registers).
+
+#define lcol(i) _lcol##i
+    Scalar lcol(0);
+    Scalar lcol(1);
+    Scalar lcol(2);
+    Scalar lcol(3);
+    Scalar lcol(4);
+    Scalar lcol(5);
+    Scalar lcol(6);
+    Scalar lcol(7);
+
+#define rrow(j) _rrow##j
+    Scalar rrow(0);
+    Scalar rrow(1);
+    Scalar rrow(2);
+    Scalar rrow(3);
+    Scalar rrow(4);
+    Scalar rrow(5);
+    Scalar rrow(6);
+    Scalar rrow(7);
+
+    // Now x corresponds to k, y to m, and z to n
+    const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
+    const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
+
+#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
+#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
+
+#define loadData(i, j) \
+  lcol(0) = lhs_element(0, j); \
+  rrow(0) = rhs_element(i, 0); \
+  lcol(1) = lhs_element(1, j); \
+  rrow(1) = rhs_element(i, 1); \
+  lcol(2) = lhs_element(2, j); \
+  rrow(2) = rhs_element(i, 2); \
+  lcol(3) = lhs_element(3, j); \
+  rrow(3) = rhs_element(i, 3); \
+  lcol(4) = lhs_element(4, j); \
+  rrow(4) = rhs_element(i, 4); \
+  lcol(5) = lhs_element(5, j); \
+  rrow(5) = rhs_element(i, 5); \
+  lcol(6) = lhs_element(6, j); \
+  rrow(6) = rhs_element(i, 6); \
+  lcol(7) = lhs_element(7, j); \
+  rrow(7) = rhs_element(i, 7);
+
+#define computeCol(j) \
+  res(0, j) += lcol(0) * rrow(j); \
+  res(1, j) += lcol(1) * rrow(j); \
+  res(2, j) += lcol(2) * rrow(j); \
+  res(3, j) += lcol(3) * rrow(j); \
+  res(4, j) += lcol(4) * rrow(j); \
+  res(5, j) += lcol(5) * rrow(j); \
+  res(6, j) += lcol(6) * rrow(j); \
+  res(7, j) += lcol(7) * rrow(j);
+
+#define computePass(i) \
+  loadData(i, i); \
+  \
+  computeCol(0); \
+  computeCol(1); \
+  computeCol(2); \
+  computeCol(3); \
+  computeCol(4); \
+  computeCol(5); \
+  computeCol(6); \
+  computeCol(7);
+
+    computePass(0);
+    computePass(1);
+    computePass(2);
+    computePass(3);
+    computePass(4);
+    computePass(5);
+    computePass(6);
+    computePass(7);
+
+#undef lcol
+#undef rrow
+#undef lhs_element
+#undef rhs_element
+#undef loadData
+#undef computeCol
+#undef computePass
+  }  // end loop over k
+
+  // we've now iterated over all of the large (i.e., width-64) k blocks and
+  // accumulated results in registers. At this point thread (x, y, z) contains
+  // the sum across all big k blocks of the product of little k block of index (x, y)
+  // with block of index (y, z). To compute the final output, we need to reduce
+  // the 8 threads over y by summation.
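The reduction that follows uses XOR warp shuffles. As a standalone sketch of the same butterfly pattern (illustrative only, not part of the patch; it assumes CUDA 9+ for the __shfl_xor_sync intrinsic, and the kernel name is hypothetical):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void butterfly_sum8(float* out) {
  float v = static_cast<float>(threadIdx.x);
  // XOR masks 1, 2, 4: after the three steps, every lane in each aligned
  // group of 8 lanes holds that group's sum (cf. reduceMatrix(1/2/4) below).
  v += __shfl_xor_sync(0xFFFFFFFF, v, 1);
  v += __shfl_xor_sync(0xFFFFFFFF, v, 2);
  v += __shfl_xor_sync(0xFFFFFFFF, v, 4);
  out[threadIdx.x] = v;
}

int main() {
  float* d = nullptr;
  cudaMalloc(&d, 32 * sizeof(float));
  butterfly_sum8<<<1, 32>>>(d);
  float h[32];
  cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
  std::printf("%g %g\n", h[0], h[8]);  // 28 (= 0+..+7) and 92 (= 8+..+15)
  cudaFree(d);
  return 0;
}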
+#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+#else
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
+#endif
+
+#define reduceRow(i, mask) \
+  shuffleInc(i, 0, mask); \
+  shuffleInc(i, 1, mask); \
+  shuffleInc(i, 2, mask); \
+  shuffleInc(i, 3, mask); \
+  shuffleInc(i, 4, mask); \
+  shuffleInc(i, 5, mask); \
+  shuffleInc(i, 6, mask); \
+  shuffleInc(i, 7, mask);
+
+#define reduceMatrix(mask) \
+  reduceRow(0, mask); \
+  reduceRow(1, mask); \
+  reduceRow(2, mask); \
+  reduceRow(3, mask); \
+  reduceRow(4, mask); \
+  reduceRow(5, mask); \
+  reduceRow(6, mask); \
+  reduceRow(7, mask);
+
+  // actually perform the reduction, now each thread of index (_, y, z)
+  // contains the correct values in its registers that belong in the output
+  // block
+  reduceMatrix(1);
+  reduceMatrix(2);
+  reduceMatrix(4);
+
+#undef shuffleInc
+#undef reduceRow
+#undef reduceMatrix
+
+  // now we need to copy the 64 values into main memory. We can't split work
+  // among threads because all variables are in registers. There are two ways
+  // to do this:
+  // (1) have 1 thread do 64 writes from registers into global memory
+  // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
+  //     each do 8 writes into global memory. We can just overwrite the shared
+  //     memory from the problem we just solved.
+  // (2) is slightly faster than (1) due to less branching and more ILP
+
+  // TODO: won't yield much gain, but could just use currently unused shared mem
+  // and then we won't have to sync
+  // wait for shared mem to be out of use
+  __syncthreads();
+
+#define writeResultShmem(i, j) lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j);
+
+#define writeRow(i) \
+  writeResultShmem(i, 0); \
+  writeResultShmem(i, 1); \
+  writeResultShmem(i, 2); \
+  writeResultShmem(i, 3); \
+  writeResultShmem(i, 4); \
+  writeResultShmem(i, 5); \
+  writeResultShmem(i, 6); \
+  writeResultShmem(i, 7);
+
+  if (threadIdx.x == 0) {
+    writeRow(0);
+    writeRow(1);
+    writeRow(2);
+    writeRow(3);
+    writeRow(4);
+    writeRow(5);
+    writeRow(6);
+    writeRow(7);
+  }
+#undef writeResultShmem
+#undef writeRow
+
+  const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+  const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
+
+  if (threadIdx.x < max_i_write) {
+    if (max_j_write == 8) {
+      // TODO: can I trade bank conflicts for coalesced writes?
+ Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; + Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; + Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; + Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; + Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; + Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; + Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; + Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; + + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; + } + } + } +#undef res +} + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(512, 1) +#else +__launch_bounds__(512) +#endif + EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, const OutputMapper output, const Index m_size, + const Index n_size, const Index k_size) { + __shared__ Scalar lhs_shmem[72 * 64]; + __shared__ Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + +template +__device__ __forceinline__ void EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i = 0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg = lhs.template loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg = lhs.template loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x = lhs(row + 0, col); \ + reg.y = lhs(row + 1, col); \ + reg.z = lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x = lhs(row + 0, col); \ + reg.y = lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x = lhs(row + 0, col); \ + } \ + } \ + } + + Index lhs_vert = base_m + 
threadIdx.x * 4; + + for (Index k = 0; k < k_size; k += 16) { + lhs_pf0 = internal::pset1(0); + rhs_pf0 = internal::pset1(0); + + Index lhs_horiz = threadIdx.y + k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k + (threadIdx.x % 4) * 4; + Index rhs_horiz0 = (threadIdx.x >> 2) + threadIdx.y * 4 + base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2; + // the following can be a bitwise operation..... some day. + if ((threadIdx.x % 8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } +#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); +#else + x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4); + x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4); +#endif + if ((threadIdx.x % 8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. + // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... + rhs_shmem2[(threadIdx.x >> 3) + threadIdx.y * 2][threadIdx.x % 8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(threadIdx.x >> 3) + threadIdx.y * 2 + 32][threadIdx.x % 8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... 
+ + lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y + 16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); + +#define add_vals(fl1, fl2, fr1, fr2) \ + results[0].x += fl1.x * fr1.x; \ + results[0].y += fl1.y * fr1.x; \ + results[0].z += fl2.x * fr1.x; \ + results[0].w += fl2.y * fr1.x; \ + \ + results[1].x += fl1.x * fr1.y; \ + results[1].y += fl1.y * fr1.y; \ + results[1].z += fl2.x * fr1.y; \ + results[1].w += fl2.y * fr1.y; \ + \ + results[2].x += fl1.x * fr2.x; \ + results[2].y += fl1.y * fr2.x; \ + results[2].z += fl2.x * fr2.x; \ + results[2].w += fl2.y * fr2.x; \ + \ + results[3].x += fl1.x * fr2.y; \ + results[3].y += fl1.y * fr2.y; \ + results[3].z += fl2.x * fr2.y; \ + results[3].w += fl2.y * fr2.y; + + __syncthreads(); + +// Do the multiplies. +#pragma unroll + for (int koff = 0; koff < 16; koff++) { + // 32 x threads. + float2 fl1 = lhs_shmem2[koff][threadIdx.x]; + float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; + + int start_feature = threadIdx.y * 4; + float2 fr1 = rhs_shmem2[(start_feature >> 1) + 32 * ((koff % 4) / 2)][koff / 4 + (koff % 2) * 4]; + float2 fr2 = rhs_shmem2[(start_feature >> 1) + 1 + 32 * ((koff % 4) / 2)][koff / 4 + (koff % 2) * 4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = threadIdx.y * 4 + base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + }*/ + for (int i = 0; i < 4; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
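+    // Illustrative note (not in the upstream source): each thread owns a
+    // 4 (rows) x 4 (columns) output tile here, so when the tile straddles
+    // both matrix edges every row (lhs_vert + r) and column (horiz_base + i)
+    // must be bounds-checked before its store.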
+ for (int i = 0; i < 4; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + +template +__device__ __forceinline__ void EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i = 0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + Index lhs_vert = base_m + threadIdx.x * 4 + (threadIdx.y % 4) * 32; + for (Index k = 0; k < k_size; k += 32) { + lhs_pf0 = internal::pset1(0); + lhs_pf1 = internal::pset1(0); + lhs_pf2 = internal::pset1(0); + lhs_pf3 = internal::pset1(0); + + rhs_pf0 = internal::pset1(0); + rhs_pf1 = internal::pset1(0); + + if (!CHECK_LHS_BOUNDARY) { + if ((threadIdx.y / 4 + k + 24) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + lhs_pf2 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 16)); + lhs_pf3 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 24)); + } else if ((threadIdx.y / 4 + k + 16) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + lhs_pf2 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 16)); + } else if ((threadIdx.y / 4 + k + 8) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + } else if ((threadIdx.y / 4 + k) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((threadIdx.y / 4 + k + 24) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + lhs_pf2 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 16)); + lhs_pf3 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 24)); + } else if ((threadIdx.y / 4 + k + 16) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + lhs_pf2 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 16)); + } else if ((threadIdx.y / 4 + k + 8) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + } else if ((threadIdx.y / 4 + k) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + } + } else if (lhs_vert + 2 < m_size) { + if ((threadIdx.y / 4 + k + 24) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf0.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + 
k + 8)); + lhs_pf1.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + lhs_pf2.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 16)); + lhs_pf2.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 16)); + lhs_pf3.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 24)); + lhs_pf3.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 24)); + lhs_pf3.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 24)); + } else if ((threadIdx.y / 4 + k + 16) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf0.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8)); + lhs_pf1.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + lhs_pf2.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 16)); + lhs_pf2.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 16)); + } else if ((threadIdx.y / 4 + k + 8) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf0.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8)); + lhs_pf1.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 8)); + } else if ((threadIdx.y / 4 + k) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf0.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((threadIdx.y / 4 + k + 24) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + lhs_pf2.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 16)); + lhs_pf3.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 24)); + lhs_pf3.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 24)); + } else if ((threadIdx.y / 4 + k + 16) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + lhs_pf2.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 16)); + } else if ((threadIdx.y / 4 + k + 8) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8)); + } else if ((threadIdx.y / 4 + k) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + } + } else if (lhs_vert < m_size) { + if ((threadIdx.y / 4 + k + 24) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + lhs_pf3.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 24)); + } else if ((threadIdx.y / 4 + k + 16) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = 
lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + } else if ((threadIdx.y / 4 + k + 8) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + } else if ((threadIdx.y / 4 + k) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + } + } + } + __syncthreads(); + Index rhs_vert = k + threadIdx.x * 4; + Index rhs_horiz0 = threadIdx.y * 2 + base_n; + Index rhs_horiz1 = threadIdx.y * 2 + 1 + base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k + threadIdx.x * 4 + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k + threadIdx.x * 4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[threadIdx.y + 32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. 
+ rhs_shmem2[threadIdx.y + 64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[threadIdx.y + 96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4) \ + results[0].x += a_feat1.x * f1.x; \ + results[1].x += a_feat1.x * f1.y; \ + results[2].x += a_feat1.x * f2.x; \ + results[3].x += a_feat1.x * f2.y; \ + results[4].x += a_feat1.x * f3.x; \ + results[5].x += a_feat1.x * f3.y; \ + results[6].x += a_feat1.x * f4.x; \ + results[7].x += a_feat1.x * f4.y; \ + \ + results[0].y += a_feat1.y * f1.x; \ + results[1].y += a_feat1.y * f1.y; \ + results[2].y += a_feat1.y * f2.x; \ + results[3].y += a_feat1.y * f2.y; \ + results[4].y += a_feat1.y * f3.x; \ + results[5].y += a_feat1.y * f3.y; \ + results[6].y += a_feat1.y * f4.x; \ + results[7].y += a_feat1.y * f4.y; \ + \ + results[0].z += a_feat2.x * f1.x; \ + results[1].z += a_feat2.x * f1.y; \ + results[2].z += a_feat2.x * f2.x; \ + results[3].z += a_feat2.x * f2.y; \ + results[4].z += a_feat2.x * f3.x; \ + results[5].z += a_feat2.x * f3.y; \ + results[6].z += a_feat2.x * f4.x; \ + results[7].z += a_feat2.x * f4.y; \ + \ + results[0].w += a_feat2.y * f1.x; \ + results[1].w += a_feat2.y * f1.y; \ + results[2].w += a_feat2.y * f2.x; \ + results[3].w += a_feat2.y * f2.y; \ + results[4].w += a_feat2.y * f3.x; \ + results[5].w += a_feat2.y * f3.y; \ + results[6].w += a_feat2.y * f4.x; \ + results[7].w += a_feat2.y * f4.y; + + lhs_shmem2[threadIdx.y / 4][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y / 4 + 8][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[threadIdx.y / 4 + 16][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[threadIdx.y / 4 + 24][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[threadIdx.y / 4 + 32][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[threadIdx.y / 4 + 40][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[threadIdx.y / 4 + 48][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[threadIdx.y / 4 + 56][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + +// Do the multiplies. +#pragma unroll + for (int koff = 0; koff < 32; koff++) { + float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; + + // first feature is at (threadIdx.y/4) * 8 last is at start + 8. 
+ int start_feature = (threadIdx.y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature / 2 + (koff % 4) * 32][koff / 4]; + float2 br2 = rhs_shmem2[start_feature / 2 + 1 + (koff % 4) * 32][koff / 4]; + float2 br3 = rhs_shmem2[start_feature / 2 + 2 + (koff % 4) * 32][koff / 4]; + float2 br4 = rhs_shmem2[start_feature / 2 + 3 + (koff % 4) * 32][koff / 4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + +#undef add_vals + + __syncthreads(); + Index horiz_base = (threadIdx.y / 4) * 8 + base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
+ for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(256, 1) +#else +__launch_bounds__(256) +#endif + EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, const OutputMapper output, const Index m_size, + const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64 * 32]; + __shared__ float2 rhs_shmem[128 * 8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + bool check_rhs = (base_n + 63) >= n_size; + bool check_lhs128 = (base_m + 127) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM*)lhs_shmem), *((RHS_MEM*)rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM*)lhs_shmem), *((RHS_MEM*)rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM*)lhs_shmem), *((RHS_MEM*)rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM*)lhs_shmem), *((RHS_MEM*)rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(256, 1) +#else +__launch_bounds__(256) +#endif + EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + +template +struct TensorEvaluator, GpuDevice> + : public TensorContractionEvaluatorBase, GpuDevice> > { + typedef GpuDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef std::remove_const_t Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + static constexpr int Layout = TensorEvaluator::Layout; + + // 
Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef std::conditional_t(ColMajor), LeftArgType, RightArgType> EvalLeftArgType; + typedef std::conditional_t(ColMajor), RightArgType, LeftArgType> EvalRightArgType; + + static constexpr int LDims = + internal::array_size::Dimensions>::value; + static constexpr int RDims = + internal::array_size::Dimensions>::value; + static constexpr int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static constexpr int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef std::remove_const_t LhsScalar; + typedef std::remove_const_t RhsScalar; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { + EIGEN_STATIC_ASSERT((internal::is_same::value), + GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS); + } + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } + } + + template + struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, + const GpuDevice& device) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_GPU_KERNEL((EigenContractionKernel), num_blocks, + block_size, 0, device, lhs, rhs, output, m, n, k); + } + }; + + template + struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, + const GpuDevice& device) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16), num_blocks, + block_size, 0, device, lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 
num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_GPU_KERNEL((EigenFloatContractionKernel), num_blocks, + block_size, 0, device, lhs, rhs, output, m, n, k); + } + } + }; + + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + EIGEN_UNUSED_VARIABLE(k) + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)) + this->m_device.fill(buffer, buffer + m * n, Scalar(0)); + + typedef internal::TensorContractionInputMapper + LhsMapper; + + typedef internal::TensorContractionInputMapper + RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + +#if defined(EIGEN_USE_HIP) + setGpuSharedMemConfig(hipSharedMemBankSizeEightByte); +#else + setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte); +#endif + + LaunchKernels::Run(lhs, rhs, output, m, n, k, + this->m_device); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and EIGEN_GPUCC +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h new file mode 100644 index 00000000000..6367db96409 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -0,0 +1,533 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +enum { Rhs = 0, Lhs = 1 }; + +/* + * Implementation of the Eigen blas_data_mapper class for tensors. + */ +/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the +/// default make pointer is used which is scalar * for CoeffLoader. 
+template class MakePointer_ = MakePointer> +struct CoeffLoader; + +template class MakePointer_ = MakePointer> +class BaseTensorContractionMapper; + +template class MakePointer_> +struct CoeffLoader { + enum { DirectOffsets = false }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) { + eigen_assert(false && "unsupported"); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_::Type data() const { + eigen_assert(false && "unsupported"); + return NULL; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { + return m_tensor.coeff(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Tensor::PacketReturnType packet(typename Tensor::Index index) const { + return m_tensor.template packet(index); + } + + private: + const Tensor m_tensor; +}; + +template class MakePointer_> +struct CoeffLoader { + enum { DirectOffsets = true }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { m_data += offset; } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_::Type data() const { + return m_data; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { + return loadConstant(m_data + index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Tensor::PacketReturnType packet(typename Tensor::Index index) const { + return internal::ploadt_ro(m_data + index); + } + + private: + typedef typename Tensor::Scalar Scalar; + + typename MakePointer_::Type m_data; +}; + +template class MakePointer_ = MakePointer> +class SimpleTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC SimpleTensorContractionMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, const contract_t& contract_strides, + const contract_t& k_strides) + : m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) {} + + enum { DirectOffsets = CoeffLoader::DirectOffsets }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { + m_tensor.offsetBuffer(offset); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void prefetch(Index /*i*/) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 + Index nocontract_val = left ? 
row : col; + Index linidx = 0; + EIGEN_UNROLL_LOOP + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx = nocontract_val / m_ij_strides[i]; + linidx += idx * m_nocontract_strides[i]; + nocontract_val -= idx * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx += nocontract_val; + } else { + linidx += nocontract_val * m_nocontract_strides[0]; + } + } + + Index contract_val = left ? col : row; + if (array_size::value > 0) { + EIGEN_UNROLL_LOOP + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } + } + + return linidx; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, + const Index distance) const { + const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 + Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; + Index linidx[2] = {0, 0}; + if (array_size::value > array_size::value) { + EIGEN_UNROLL_LOOP + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx[0] += nocontract_val[0]; + linidx[1] += nocontract_val[1]; + } else { + linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; + linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; + } + } + + Index contract_val[2] = {left ? col : row, left ? col : row + distance}; + if (array_size::value > 0) { + EIGEN_UNROLL_LOOP + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } + + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } + } + return IndexPair(linidx[0], linidx[1]); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { + // Only claim alignment when we can compute the actual stride (ie when we're + // dealing with the lhs with inner_dim_contiguous. This is because the + // matrix-vector product relies on the stride when dealing with aligned inputs. + return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { + return ((side == Lhs) && inner_dim_contiguous && array_size::value > 0) ? 
m_contract_strides[0] : 1; + } + + const CoeffLoader& tensor() const { return m_tensor; } + + const nocontract_t& nocontract_strides() const { return m_nocontract_strides; } + const nocontract_t& ij_strides() const { return m_ij_strides; } + const contract_t& contract_strides() const { return m_contract_strides; } + const contract_t& k_strides() const { return m_k_strides; } + + protected: + CoeffLoader m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + +template class MakePointer_> +class BaseTensorContractionMapper + : public SimpleTensorContractionMapper { + public: + typedef SimpleTensorContractionMapper + ParentMapper; + + EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, const contract_t& contract_strides, + const contract_t& k_strides) + : ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + std::enable_if_t::size == packet_size, PacketT> + load(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + eigen_assert(this->computeIndex(i + packet_size - 1, j) == index + packet_size - 1); + return this->m_tensor.template packet(index); + } + + const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index lastIdx = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. 
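+    // Concretely: computeIndexPair returns endpoints exactly packet_size - 1 apart only when the whole packet is
+    // contiguous in memory, so the fast branch below can issue a single vectorized load; otherwise the elements are
+    // gathered pairwise into an aligned stack buffer and loaded with pload (a shuffled Rhs contracting dimension,
+    // for example, takes the gather path).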
+    if (Tensor::PacketAccess && (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) &&
+        (lastIdx - first) == (packet_size - 1)) {
+      return this->m_tensor.template packet(first);
+    }
+
+    EIGEN_ALIGN_MAX Scalar data[packet_size];
+
+    data[0] = this->m_tensor.coeff(first);
+    EIGEN_UNROLL_LOOP
+    for (Index k = 1; k < packet_size - 1; k += 2) {
+      const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1);
+      data[k] = this->m_tensor.coeff(internal_pair.first);
+      data[k + 1] = this->m_tensor.coeff(internal_pair.second);
+    }
+    data[packet_size - 1] = this->m_tensor.coeff(lastIdx);
+
+    return pload(data);
+  }
+
+  template
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      std::enable_if_t::size != packet_size, PacketT>
+      load(Index i, Index j) const {
+    const Index requested_packet_size = internal::unpacket_traits::size;
+    EIGEN_ALIGN_MAX Scalar data[requested_packet_size];
+
+    const IndexPair indexPair = this->computeIndexPair(i, j, requested_packet_size - 1);
+    const Index first = indexPair.first;
+    const Index lastIdx = indexPair.second;
+
+    data[0] = this->m_tensor.coeff(first);
+    for (Index k = 1; k < requested_packet_size - 1; k += 2) {
+      const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1);
+      data[k] = this->m_tensor.coeff(internal_pair.first);
+      data[k + 1] = this->m_tensor.coeff(internal_pair.second);
+    }
+    data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx);
+
+    return pload(data);
+  }
+
+  template
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
+    return this->load(i, j);
+  }
+};
+
+template class MakePointer_>
+class BaseTensorContractionMapper
+    : public SimpleTensorContractionMapper {
+ public:
+  typedef SimpleTensorContractionMapper
+      ParentMapper;
+
+  EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, const nocontract_t& nocontract_strides,
+                                                const nocontract_t& ij_strides, const contract_t& contract_strides,
+                                                const contract_t& k_strides)
+      : ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) {}
+
+  template
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
+    EIGEN_ALIGN_MAX Scalar data[1];
+    data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
+    return pload(data);
+  }
+  template
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
+    EIGEN_ALIGN_MAX Scalar data[1];
+    data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
+    return pload(data);
+  }
+};
+
+template class MakePointer_ = MakePointer>
+class TensorContractionSubMapper {
+ public:
+  typedef BaseTensorContractionMapper
+      ParentMapper;
+  typedef TensorContractionSubMapper
+      Self;
+  typedef Self LinearMapper;
+  typedef Self SubMapper;
+
+  enum {
+    // We can use direct offsets iff the parent mapper supports them and we can compute the strides.
+    // TODO: we should also enable direct offsets for the Rhs case.
+    UseDirectOffsets =
+        ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size::value > 0)
+  };
+
+  EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
+      : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) {
+    // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute
+    // this offset every time we attempt to access a coefficient.
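+    // After this baking step, the UseDirectOffsets branches of the accessors below index relative to the sub-block
+    // origin: the base mapper's buffer has already been advanced by vert_offset + horiz_offset * stride elements,
+    // so no per-access offset arithmetic is needed.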
+ if (UseDirectOffsets) { + Index stride = m_base_mapper.stride(); + m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride); + } + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper(i, 0); + } + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper(i, j); + } + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, j); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacketPartial(Index i, Index j, Index, Index = 0) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, j); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper.template load(i, j); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const { + if (UseDirectOffsets) { + m_base_mapper.storePacket(i, 0, p); + } + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + if (UseDirectOffsets) { + return LinearMapper(m_base_mapper, i, j); + } + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubMapper getSubMapper(Index i, Index j) const { + if (UseDirectOffsets) { + return SubMapper(m_base_mapper, i, j); + } + return SubMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const Index stride() const { return m_base_mapper.stride(); } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? 
Aligned : Unaligned; + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { + return false; + } + + const ParentMapper& base_mapper() const { return m_base_mapper; } + Index vert_offset() const { return m_vert_offset; } + Index horiz_offset() const { return m_horiz_offset; } + + private: + ParentMapper m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + +template class MakePointer_ = MakePointer> +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + public: + typedef Scalar_ Scalar; + typedef BaseTensorContractionMapper + Base; + typedef TensorContractionSubMapper + SubMapper; + typedef SubMapper VectorMapper; + typedef SubMapper LinearMapper; + + EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader& get_tensor() const { + return Base::m_tensor; + } +}; + +template +struct TensorContractionInputMapperTrait; + +template class MakePointer_> +struct TensorContractionInputMapperTrait< + TensorContractionInputMapper > { + typedef Tensor_ XprType; + static const bool inner_dim_contiguous = inner_dim_contiguous_; + static const bool inner_dim_reordered = inner_dim_reordered_; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h new file mode 100644 index 00000000000..c7203a9306e --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -0,0 +1,1654 @@ +// This file is part of Eigen, a lightweight C++ template library for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not +// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
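Between the two headers, a brief illustration may help: every mapper above ultimately funnels through SimpleTensorContractionMapper::computeIndex, which converts a (row, col) coordinate of the logically flattened matrix into a linear index of the underlying tensor using the nocontract/contract stride arrays. The sketch below is a minimal, self-contained model of that linearization, assuming a fixed rank-3, column-major tensor contracted over its last dimension; the name computeIndexSketch and the two-entry stride arrays are hypothetical simplifications (the real loop handles arbitrary rank and both the Lhs and Rhs sides).

// Sketch: linearize a (row, col) coordinate of the flattened matrix into a
// linear index of a rank-3, column-major tensor of shape [4, 5, 6] that is
// contracted over its last dimension (the Lhs case). The stride arrays play
// the roles of m_nocontract_strides / m_ij_strides above; contract_stride
// stands in for m_contract_strides[0].
#include <array>
#include <cstdio>

long computeIndexSketch(long row, long col, const std::array<long, 2>& nocontract_strides,
                        const std::array<long, 2>& ij_strides, long contract_stride) {
  long linidx = 0;
  long nocontract_val = row;
  // Peel off the outer non-contracting dimension, mirroring the stride loop
  // in SimpleTensorContractionMapper::computeIndex.
  const long idx = nocontract_val / ij_strides[1];
  linidx += idx * nocontract_strides[1];
  nocontract_val -= idx * ij_strides[1];
  // The innermost non-contracting dimension contributes with stride 1 here.
  linidx += nocontract_val * nocontract_strides[0];
  // The contracting coordinate contributes col * stride.
  return linidx + col * contract_stride;
}

int main() {
  // Shape [4, 5, 6], column-major strides {1, 4, 20}; non-contracting dims
  // {0, 1} give tensor strides {1, 4} and logical row strides {1, 4}.
  // row 7 decodes to (i0 = 3, i1 = 1); with col 2: 3*1 + 1*4 + 2*20 = 47.
  std::printf("%ld\n", computeIndexSketch(7, 2, {1, 4}, {1, 4}, 20));
  return 0;
}

In column-major storage the innermost non-contracting stride is 1, which is exactly the inner_dim_contiguous fast path that the mappers assert on (m_nocontract_strides[0] == 1) and that the packet-load code relies on.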
+
+/*****************************************************************
+ * TensorContractionSycl.h
+ *
+ * \brief:
+ *  TensorContractionSycl.h, provides various tensor contraction kernels for the SYCL backend
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace TensorSycl {
+namespace internal {
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+/*!
+ * \brief TVPanelSize, a template class used for setting the panel size required for launching General TensorVector
+ * contraction kernel on various hardware devices.
+ *
+ * \tparam Scalar: determines the element type of the tensor/vector
+ *
+ * \tparam StorageIndex determines the Index type.
+ *
+ * \tparam NCWindow: determines the number of non-contracting elements to be processed by each work-group
+ *
+ * \tparam CFactor: determines the number of contracting elements to be processed by each thread
+ *
+ * \tparam NCFactor: determines the number of non-contracting elements to be processed by each thread
+ */
+template
+struct TVPanelSize {
+  // LocalThreadSizeC: determines the total number of threads per workgroup for the contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+  // LocalThreadSizeNC: determines the total number of threads per workgroup for the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+  // TileSizeDimNC: determines the tile size for the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor;
+  // TileSizeDimC: determines the tile size for the contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC;
+  // WorkLoadPerThreadNC: determines the workload per thread for loading the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC;
+  // WorkLoadPerThreadC: determines the workload per thread for loading the contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC;
+  // BC : determines if supporting bank conflict is required
+  static EIGEN_CONSTEXPR bool BC = false;
+};
+#endif
+
+/*!
+ * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor
+ contraction kernel on various hardware devices.
+ *
+ * \tparam Scalar: determines the element type of the tensor
+ *
+ * \tparam StorageIndex: determines the Index type.
+ *
+ * \tparam REG_SIZE_M: determines the workload per thread for loading the M dimension. This can be varied based on the
+ available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_M macro).
+ *
+ * \tparam REG_SIZE_N: determines the workload per thread for loading the N dimension. This can be varied based on the
+ available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_N macro).
+ *
+ * \tparam TSDK: determines the tile size for dimension K. The packet size is assumed to be considered
+ */
+
+template
+struct TTPanelSize {
+  // TileSizeDimK: determines the tile size for dimension K.
The packet size is assumed to be considered + static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK; + // WorkLoadPerThreadM : determines workload per thread for loading the M dimension This can be varied based on the + // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro// +#ifndef EIGEN_SYCL_REG_M + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M; +#endif +// WorkLoadPerThreadN : determines workload per thread for loading the N dimension This can be varied based on the +// available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro +#ifndef EIGEN_SYCL_REG_N + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N; +#endif + // LocalThreadSizeM: determines total number of thread per workgroup for the m dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0; + // LocalThreadSizeN: determines total number of thread per workgroup for the n dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1; + // TileSizeDimM: determines the tile size for the m dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM; + // TileSizeDimN: determines the tile size for the n dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN; + // LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisible by packetsize + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN)); + // LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. This must be divisible by packetsize + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM)); + // BC : determines if supporting bank conflict is required + static EIGEN_CONSTEXPR bool BC = true; + // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by + // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory) + static EIGEN_CONSTEXPR bool DoubleBuffer = +#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER + false; +#else + true; +#endif +}; + +/* ! + * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to + * specialize the contraction algorithm based on device support for dedicated local memory. + */ +enum class contraction_type { local, no_local }; +/* ! + * \brief data_source an enum class determining the location of the data in a memory hierarchy (global, local, private). + */ +enum class data_source { global_mem, local_mem, private_mem }; + +/*! + * \brief read, a template function used for loading the data from global + memory. This function is used to guarantee coalesced and vectorized load whenever possible + * + * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and + vectorized when possible. Coalesced memory access is a key factor in Kernel performance. 
When a tensor is 2d and the + contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex is the contracting dim index + * + * \param ld: is the leading dimension of the flattened tensor + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) { + const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex; + const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex; + return tensorMapper.get_tensor().template packet(row + (col * ld)); +} + +/*! + * \brief read, special overload of read function, when the read access is not vectorized + * + * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and + vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex: is the contracting dim index + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) { + const StorageIndex row = (IsRhs) ? CIndex : NCIndex; + const StorageIndex col = (IsRhs) ? NCIndex : CIndex; + return tensorMapper(row, col); +} + +/*! + * \brief write, a template function used for storing the data to local memory. This function is used to guarantee + * coalesced and vectorized store whenever possible. + * + * \tparam StorageIndex: determines the Index type + * + * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + * + * \param CIndex is the contracting dim index + */ + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t
write( + PacketType &packet_data, DataScalar ptr) { + EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits::size; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; i++) { + *ptr = PacketWrapper::scalarize(i, packet_data); + ptr += ld; + } +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization enabled This function + * is used to guarantee coalesced and vectorized store whenever possible. + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + */ + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename std::enable_if_t::size != 1 && dt == data_source::global_mem, + void> + write(PacketType &packet_data, DataScalar *ptr) { + ::Eigen::internal::pstoreu(ptr, packet_data); +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled. + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename std::enable_if_t::size == 1 && dt == data_source::global_mem, + void> + write(PacketType &packet_data, DataScalar *ptr) { + *ptr = packet_data; +} + +/*! + * \brief check_boundary: is used to check the edge condition for non-internal blocks. + * + * \tparam is_internal: determines if the block is internal + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) { + return true; +} + +/*! + * \brief check_boundary: specialization of the check_boundary for non-internal blocks. + * + * \param cond: true when the data is in range. Otherwise false + */ +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool cond) { + return cond; +} + +/*! + * \brief BlockProperties is a template class that provides different characteristic of a block of each Tensor processed + * by each workgroup. + * + * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed + * + * \tparam packet_load_: determines if the each element of this tensor block should be loaded in a packet mode + * + * \tparam PacketType: determines the type of packet + * + * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be + * packetType; Otherwise it will be scalar Type + * + * \param elements_per_access determines the size of each element based on OutType + * + * \param is_coalesced_layout determines whether or not the Tensor data in a memory can be access coalesced and + * vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + * contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + * when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. 
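+ *
+ * For example (hypothetical values): with PacketType = float4, elements_per_access is 4, so a coalesced block steps
+ * 4 elements along the non-contracting dimension per access (nc_stride = 4, c_stride = 1), while a non-coalesced
+ * block steps 4 along the contracting dimension instead (nc_stride = 1, c_stride = 4).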
+ *
+ * \param nc_stride determines the stride of the non-contracting dimension to access the next adjacent element within
+ * the Tensor Block for each workgroup
+ *
+ * \param c_stride determines the stride of the contracting dimension to access the next adjacent element within the
+ * Tensor Block for each workgroup
+ */
+template
+struct BlockProperties {
+  static EIGEN_CONSTEXPR bool packet_load = packet_load_;
+  typedef typename Eigen::internal::unpacket_traits::type OutScalar;
+  static EIGEN_CONSTEXPR bool is_rhs = is_rhs_;
+  typedef std::conditional_t OutType;
+  static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits::size;
+  static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs);
+  static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1);
+  static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access);
+};
+
+/*!
+ * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup. Please see
+ * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the definition of
+ * workgroups and work-items
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \param linearLocalThreadId: determines the linearized location of a thread within a work-group
+ *
+ * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. It will be > 1 when
+ * tall/skinny algorithm is used
+ *
+ * \param mGroupOffset: determines the logical start position of all threads within a workgroup for the m dimension of
+ * the flattened tensor.
+ *
+ * \param nGroupOffset: determines the logical start position of all threads within a workgroup for the n dimension of
+ * the flattened tensor.
+ *
+ * \param kGroupOffset determines the logical start position of all threads within a workgroup for the k dimension of
+ * the flattened tensor. It will be > 1 when tall/skinny algorithm is used.
+ *
+ * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of
+ * a flattened tensor. The position determines the distance of each thread within the workgroup from each other
+ * independent from their global position.
+ *
+ * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of
+ * a flattened tensor. The position determines the distance of each thread within the workgroup from each other
+ * independent from their global position.
+ *
+ * \param mGlobalOffset: determines the logical start position of each thread for the m dimension on a
+ * flattened tensor
+ *
+ * \param nGlobalOffset: determines the logical start position of each thread for the n dimension on a
+ * flattened tensor
+ *
+ * \param kSize: determines the number of the k elements of the flattened Tensor to be processed by each thread for
+ * the given tensor block. This differs from the K dimension of the flattened tensor when the tall/skinny algorithm
+ * is used.
+ *
+ * \param is_internal: determines whether the thread within the work-group computes an internal block of the tensor
+ * or an edge block. When it is internal, there is no need to check the boundaries and all the if statements can be
+ * resolved by the compiler.
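+ *
+ * A worked example with hypothetical sizes: for Properties::TileSizeDimM = 64 and TileSizeDimN = 32, the work-group
+ * with mGroupId = 2 and nGroupId = 1 covers mGroupOffset = 128 and nGroupOffset = 32; a thread whose mLocalOffset is
+ * 8 and nLocalOffset is 4 then starts at mGlobalOffset = 136 and nGlobalOffset = 36.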
+ */
+template
+struct ThreadProperties {
+  const StorageIndex linearLocalThreadId;
+  const StorageIndex kGroupId;
+  const StorageIndex mGroupOffset;
+  const StorageIndex nGroupOffset;
+  const StorageIndex kGroupOffset;
+  const StorageIndex mLocalOffset;
+  const StorageIndex nLocalOffset;
+  const StorageIndex mGlobalOffset;
+  const StorageIndex nGlobalOffset;
+  StorageIndex kSize;
+  const bool is_internal;
+  // this is used to adjust the last block
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties(
+      const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_,
+      const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_,
+      const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_,
+      StorageIndex kSize_, const bool is_internal_)
+      : linearLocalThreadId(linearLocalThreadId_),
+        kGroupId(kGroupId_),
+        mGroupOffset(mGroupOffset_),
+        nGroupOffset(nGroupOffset_),
+        kGroupOffset(kGroupOffset_),
+        mLocalOffset(mLocalOffset_),
+        nLocalOffset(nLocalOffset_),
+        mGlobalOffset(mGlobalOffset_),
+        nGlobalOffset(nGlobalOffset_),
+        kSize(kSize_),
+        is_internal(is_internal_) {}
+};
+
+/*!
+ * \brief TensorContractionKernel is a template class that provides the Tensor-Tensor contraction operation.
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam LhsScalar: determines the left-hand-side scalar type
+ *
+ * \tparam RhsScalar: determines the right-hand-side scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for output (please see the sycl-1.2.1 specification
+ (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam LhsMapper determines the tensor contraction mapper type for the left-hand-side matrix
+ *
+ * \tparam RhsMapper determines the tensor contraction mapper type for the right-hand-side matrix
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Properties: determines the Contraction Panel properties
+ *
+ * \tparam TripleDim: determines the M, K, N dimensions for the flattened tensors in order to treat them as matrices
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \tparam input_mapper_properties: determines if the input tensors are matrices. If they are, special memory
+ access is used to guarantee that the memory accesses are always coalesced.
+ *
+ * \tparam IsFinal: determines if this is the final kernel. If so, the result will be written in a final output.
+ Otherwise, the result of the contraction will be written in a temporary buffer. This is the case when Tall/Skinny
+ contraction is used. So in this case, a final reduction step is required to compute the final output.
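+ (In the Tall/Skinny case, work-groups that share a kGroupId therefore contract only their own numTiles *
+ TileSizeDimK slice of K into a dedicated M * N partial-result buffer, and the per-slice buffers are combined by the
+ final reduction step mentioned above.)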
+ + * \tparam contraction_tp: it is an enum value representing whether the local memory/no local memory implementation of + the algorithm to be used + * + * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param groupSizeM: a logical number determining the number of work-group for m dimension + * + * \param groupSizeN: a logical number determining the number of work-group for n dimension + * + * \param numTiles: determines total number of tiles on the k dimension + * + * \param TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix + */ +template +class TensorContractionKernel { + public: + typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise::PacketSize; + static EIGEN_CONSTEXPR bool is_lhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; + + typedef BlockProperties + LHSBlockProperties; + + typedef BlockProperties + RHSBlockProperties; + + static EIGEN_CONSTEXPR StorageIndex NStride = + contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride; + + typedef cl::sycl::accessor Scratch; + typedef cl::sycl::multi_ptr local_ptr; + typedef OutScalar * /*cl::sycl::multi_ptr*/ private_ptr; + typedef std::conditional_t tile_ptr; + static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local + ? Properties::TileSizeDimM + Properties::BC + : Properties::WorkLoadPerThreadM; + static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local + ? Properties::TileSizeDimN + Properties::BC + : Properties::WorkLoadPerThreadN; + static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + + /** + * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not + * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to + * the kernel as an accessor, the private memory can only allowed to be allocated statically. Since we are abstracting + * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out + * different type of memory needed when local/no_local memory computation is called. + * + * \tparam contraction_type: it is an enum value representing whether the local memory/no local memory implementation + of the algorithm to be used + * \tparam the private memory size + * \param ptr the tile memory pointer type + */ + template + struct MemHolder { + tile_ptr ptr; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {} + }; + /** + * \brief specialization of memHolder class when no local memory kernel is used. 
+ */ + template + struct MemHolder { + OutScalar ptr[MemSize] = {OutScalar{0}}; + }; + /** + * \brief TiledMemory: contains required memory pointer for loading each tile of the TensorContraction panel from + * global memory to local/private memory when local/no_local algorithm used. + * + * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the + * selected contraction_type. + * + * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the + * selected contraction_type. + * + * \param lhs_extract_index: determines the position of each thread on a local memory for lhs input. When private + * memory is used this is set to zero as this is not applicable in case of private memory. + * + * \param rhs_extract_index: determines the position of each thread on a local memory for rhs input. When private + * memory is used this is set to zero as this is not applicable in case of private memory. + * + * \param lhs_scratch_compute : determines the location to load for computation for lhs_local memory. This is the + * same as lhs_scratch_extract for private memory. + * + * \param rhs_scratch_compute : determines the location to load for computation for rhs_local memory. This is the + * same as rhs_scratch_extract for private memory. + */ + struct TiledMemory { + MemHolder lhs_scratch_extract; + MemHolder rhs_scratch_extract; + tile_ptr lhs_scratch_ptr_compute; + tile_ptr rhs_scratch_ptr_compute; + const std::pair lhs_extract_index; + const std::pair rhs_extract_index; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TiledMemory(const ThreadProperties &, local_ptr, + std::enable_if_t * = 0) + : lhs_scratch_extract{}, + rhs_scratch_extract{}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr), + lhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})), + rhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TiledMemory(const ThreadProperties &thread_properties, + local_ptr block_start_ptr, + std::enable_if_t * = 0) + : lhs_scratch_extract{block_start_ptr}, + rhs_scratch_extract{lhs_scratch_extract.ptr + + ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset), + lhs_extract_index( + local_id_extract(thread_properties.linearLocalThreadId)), + rhs_extract_index( + local_id_extract(thread_properties.linearLocalThreadId)) {} + }; + + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex groupSizeM; + const StorageIndex groupSizeN; + const StorageIndex numTiles; + const TripleDim triple_dim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex groupSizeM_, + const StorageIndex groupSizeN_, + const StorageIndex numTiles_, + const TripleDim triple_dim_) + : scratch(scratch_), + lhs(lhs_), + rhs(rhs_), + out_res(out_res_), + groupSizeM(groupSizeM_), + groupSizeN(groupSizeN_), + numTiles(numTiles_), + triple_dim(triple_dim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex groupSizeM_, + const StorageIndex 
numTiles_,
+                                                                 const TripleDim triple_dim_)
+       : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
+
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
+     const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
+     const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
+     const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
+     const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM;
+     const StorageIndex tmp = itemID.get_group(0) / groupSizeM;
+     const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN;
+     const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN;
+     const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM;
+     const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN;
+     const StorageIndex mLocalOffset = PacketSize * mLocalThreadId;
+     const StorageIndex nLocalOffset = NStride * nLocalThreadId;
+     const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset;
+     const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset;
+
+     const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK;
+     StorageIndex kGroupOffset = kGroupId * kSizePerWG;
+     const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM &&
+                              triple_dim.N - nGroupOffset >= Properties::TileSizeDimN &&
+                              triple_dim.K - kGroupOffset >= kSizePerWG;
+     // this is used to adjust the last block
+     StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset);
+     // This is used to find the last K offset, so that kGroupOffset - kSize gives the k offset for loading the tile.
+     kGroupOffset += kSize;
+
+     auto thread_properties =
+         ThreadProperties(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
+                          mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
+
+     auto out_ptr = out_res + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
+
+     (thread_properties.is_internal) ? compute_panel(itemID, thread_properties, out_ptr)
+                                     : compute_panel(itemID, thread_properties, out_ptr);
+   }
+   // The compute block computes the private contraction block for each thread and stores the result in the
+   // privateRes memory of each computation. The compute block function is independent of the local and no-local
+   // concepts, as it only computes the block in each thread's private memory space.
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
+                                                                     PacketReturnType *privateRes) const {
+     StorageIndex idx = 0;
+     EIGEN_CONSTEXPR StorageIndex lhs_stride =
+         contraction_tp == contraction_type::local ?
(PacketSize * Properties::LocalThreadSizeM) : 1; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) { + auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)}; + StorageIndex lhs_index = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType lhsPack{}; + Eigen::TensorSycl::internal::PacketWrapper::set_packet(lhsPack, + lhs_block_ptr + lhs_index); + privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]); + + lhs_index += lhs_stride; + idx++; + } + } + } + // The store function write the computed contraction operation in the private memory of each thread to the global + // memory. The store function is independent of local and no local concepts s that it can be abstract out in the base + // class. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes, + StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) const { + auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC { + return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N); + }; + // when local memory is not used M and N are both accessed in a coalesced way. However, when local memory is + // available the k*N is transposed in the local to N*K therefore, each blocks operates on blockId* + // WorkLoadPerThreadN slice of N + EIGEN_CONSTEXPR StorageIndex GlobalNStride = + contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) { + // output leading dimension + StorageIndex outputLD = 0; + // When local memory is used the PrivateNstride is always 1 because the coalesced access on N is loaded into Local + // memory and extracting from local to global is the same as no transposed version. However, when local memory is + // not used and RHS is transposed we packetize the load for RHS. + EIGEN_UNROLL_LOOP + for (StorageIndex nId = 0; nId < PrivateNStride; nId++) { + StorageIndex globalRow = mGlobalOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType privetOut = privateRes[wLPTM]; + if (check_boundary(chk_bound(globalRow, nId))) { + // Store the final results in C. 
The C matrix has always M as a first StorageIndex and N as a second + // StorageIndex Therefore it is always coalesced layout + write(privetOut, out_ptr + outputLD + globalRow); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex mId = 0; mId < PacketSize; mId++) { + StorageIndex mOffset = globalRow + mId; + if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) { + out_ptr[mOffset + outputLD] = + Eigen::TensorSycl::internal::PacketWrapper::scalarize(mId, privetOut); + } + } + } + globalRow += (PacketSize * Properties::LocalThreadSizeM); + } + outputLD += triple_dim.M; + privateRes += Properties::WorkLoadPerThreadM / PacketSize; + } + out_ptr += (GlobalNStride * outputLD); + + nGlobalOffset += (PrivateNStride * GlobalNStride); + } + } + // when no local memory is used the following extract_block will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t extract_block( + const Input &inpt, PrivateReg private_ptr, const std::pair &, + const StorageIndex &ncOffset, const StorageIndex cOffset) const { + EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = + InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM; + EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = + InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM; + const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; + + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; + StorageIndex cIndex = cOffset; + + EIGEN_UNROLL_LOOP + for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) { + StorageIndex ncIndex = ncOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) { + if (check_boundary(chk_bound(cIndex, ncIndex))) { + auto val = + read(inpt, ncIndex, cIndex, ld); + + write(val, private_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (ncInd < NC && cInd < triple_dim.K) + ? read( + inpt, ncInd, cInd, ld) + : OutScalar(0); + write( + val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC)); + } + } + + // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So + // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread. + ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1) + ? 
ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC
+                       : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC);
+         private_ptr += InputBlockProperties::nc_stride;
+       }
+       // the previous loop (private_ptr += nc_stride per iteration) has already advanced the pointer by one
+       // WorkLoadPerThreadNC
+       private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC;
+       cIndex += InputBlockProperties::c_stride;
+     }
+   }
+   template
+   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair local_id_extract(
+       const StorageIndex &linearLocalThreadId) {
+     const StorageIndex localThreadNC =
+         (InputBlockProperties::is_coalesced_layout)
+             ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride)
+             : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+     const StorageIndex localThreadC =
+         (InputBlockProperties::is_coalesced_layout)
+             ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride)
+             : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+     return std::pair(localThreadNC, localThreadC);
+   }
+
+   template
+   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t sync_mem(
+       const cl::sycl::nd_item<1> &, bool &db_offset) noexcept {
+     db_offset = !db_offset;
+   }
+
+   template
+   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t sync_mem(
+       const cl::sycl::nd_item<1> &itemID, bool &) noexcept {
+     itemID.barrier(cl::sycl::access::fence_space::local_space);
+   }
+
+   template
+   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t sync_mem(
+       const cl::sycl::nd_item<1> &, bool &) noexcept {
+     return;
+   }
+
+   template
+   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t
+   sync_thread(const cl::sycl::nd_item<1> &
+#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
+               itemID
+#endif
+               ) noexcept {
+#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
+     itemID.barrier(cl::sycl::access::fence_space::local_space);
+#else
+     return;
+#endif
+   }
+   template
+   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t
+   sync_thread(const cl::sycl::nd_item<1> &itemID) {
+     itemID.barrier(cl::sycl::access::fence_space::local_space);
+   }
+   template
+   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t sync_thread(const cl::sycl::nd_item<1> &) {
+     return;
+   }
+
+   template
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
+                                                                     ThreadProperties &thread_properties,
+                                                                     TiledMemory &tiled_input_block,
+                                                                     PacketReturnType *privateRes,
+                                                                     bool &db_offset) const {
+     // Tiling the Rhs block from global to local memory
+     extract_block(
+         rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
+         tiled_input_block.rhs_extract_index,
+         contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset,
+         thread_properties.kGroupOffset - thread_properties.kSize);
+
+     sync_thread(itemID);
+
+     // Tiling the Lhs block from global to local memory
+     extract_block(
+         lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK),
+         tiled_input_block.lhs_extract_index,
+         contraction_tp == contraction_type::local ?
thread_properties.mGroupOffset : thread_properties.mGlobalOffset, + thread_properties.kGroupOffset - thread_properties.kSize); + + // itemID.barrier(cl::sycl::access::fence_space::local_space); + sync_thread(itemID); + // switch to compute mede + StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK); + StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR); + // Loop over the values of a single tile + for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) { + compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset, + tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes); + lhs_offset += LSDL; + rhs_offset += LSDR; + } + // computing the K index for the next tile + thread_properties.kSize -= Properties::TileSizeDimK; + sync_mem(itemID, db_offset); + } + + // when local memory is available the following compute_panel will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID, + ThreadProperties &thread_properties, + OutPtr out_ptr) const { + auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()}; + // Allocate register space + PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = { + PacketReturnType{0}}; + bool db_offset = 0; + + while (thread_properties.kSize >= Properties::TileSizeDimK) { + compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + if (thread_properties.kSize > 0) { + compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + + // Storing the final results in the output + store(1) : RHSBlockProperties::nc_stride>( + out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset, + thread_properties.nGlobalOffset); + } + // When local memory is available the following extract_block will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t extract_block( + const Input &inpt, Local local_ptr, const std::pair &local_index, + const StorageIndex &ncOffset, const StorageIndex cOffset) const { + EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = + InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM; + EIGEN_CONSTEXPR StorageIndex LoadPerThread = + InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs; + EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL; + static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) && + (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)), + " LocalOffset must be divisible by stride"); + const StorageIndex &NC = InputBlockProperties::is_rhs ? 
triple_dim.N : triple_dim.M; + StorageIndex localThreadNC = local_index.first; + StorageIndex localThreadC = local_index.second; + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + EIGEN_UNROLL_LOOP + for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) { + const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC); + const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC); + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; + if (check_boundary(chk_bound(CIndex, NCIndex))) { + auto val = + read(inpt, NCIndex, CIndex, ld); + write( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::c_stride * localThreadC * LSD)); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (nCInd < NC && cInd < triple_dim.K) + ? read( + inpt, nCInd, cInd, ld) + : OutScalar(0); + + write( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::c_stride * localThreadC + + (InputBlockProperties::is_coalesced_layout ? 0 : i)) * + LSD)); + } + } + localThreadNC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride); + localThreadC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride); + } + } +}; + +#ifndef EIGEN_SYCL_DISABLE_GEMV + +/*! + * \brief GeneralVectorTensor is a template class that provides Tensor -vector contraction operation, which is a special + * case of Tensor Tensor contraction. + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification + * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs) + * + * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs) + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Properties: determines the Contraction Panel properties + * + * \tparam KFactor: determines the number of elements in K dimension in a Tile + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector + * + * \tparam IsFinal: determine if this is the final kernel. If so, the result will be written in a final output. + * Otherwise, the result of contraction will be written iin a temporary buffer. 
+ * + * \param scratch: determines the local memory containing the vector block for each work-group + * + * \param vec: determines the vector input (tensor mapper) + * + * \param mat: determines the tensor input (tensor mapper) + * + * \param out_res: determines the output vector containing the contraction result + * + * \param nonContractGroupSize: a logical number determining the number of work-group for non-contracting dimension + * + * \param nonContractDim: determines the size of non contracting dimension for the flattened tensor + * + * \param contractDim: determines the size of non contracting dimension for the flattened tensor + * + */ +template +struct GeneralVectorTensor { + typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise::PacketSize; + typedef cl::sycl::accessor Scratch; + + static EIGEN_CONSTEXPR StorageIndex OutScratchOffset = + KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + + // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make + // sure that the !^ is true When RHS is a vector, we pass true and true to make sure that the !^ is true. + typedef BlockProperties + VecBlockProperties; + + Scratch scratch; + const VectorMapper vec; + const TensorMapper mat; + OutAccessor out_res; + const StorageIndex nonContractGroupSize; + const StorageIndex nonContractDim; + const StorageIndex contractDim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_, + const TensorMapper mat_, OutAccessor out_res_, + const StorageIndex nonContractGroupSize_, + const StorageIndex nonContractDim_, + const StorageIndex contractDim_) + : scratch(scratch_), + vec(vec_), + mat(mat_), + out_res(out_res_), + nonContractGroupSize(nonContractGroupSize_), + nonContractDim(nonContractDim_), + contractDim(contractDim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { + auto scratch_ptr = scratch.get_pointer(); + const StorageIndex linearLocalThreadId = itemID.get_local_id(0); + StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC + : linearLocalThreadId % Properties::LocalThreadSizeNC; + StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC + : linearLocalThreadId / Properties::LocalThreadSizeNC; + const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize; + const StorageIndex nonContractGroupId = + is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize; + const StorageIndex contractGroupId = + is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize; + auto out_ptr = out_res + (IsFinal ? 
0 : contractGroupId * nonContractDim); + + const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC; + const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC; + auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId; + const StorageIndex globalContractDimOffset = contractGroupOffset + contractId; + auto local_output = scratch_ptr + OutScratchOffset; + const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC && + contractDim - contractGroupOffset >= Properties::TileSizeDimC; + is_internal + ? compute_panel(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex) + : compute_panel(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel( + const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output, + OutPtr out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + OutScalar *scratch_ptr, const StorageIndex contractGroupOffset, +#endif + const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim, + StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId, + StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) { + OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)}; + // Reading the vector +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId; + extract_block(vec, scratch_ptr, linearLocalThreadId, + vectorOffset, contractDim); + + itemID.barrier(cl::sycl::access::fence_space::local_space); + auto in_scratch_ptr = scratch_ptr + contractId; +#endif + + StorageIndex privateOffsetC = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) { + StorageIndex privateOffsetNC = 0; + bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim); +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + auto vecScalar = *in_scratch_ptr; +#else + auto vecScalar = (check_boundary(contract_conds)) + ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC, + is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0)) + : OutScalar(0); +#endif + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + auto matScalar = (check_boundary( + contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim))) + ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC + : globalNonContractDimOffset + privateOffsetNC, + is_lhs_vec ? 
globalNonContractDimOffset + privateOffsetNC + : globalContractDimOffset + privateOffsetC) + : OutScalar(0); + + outScalar[j] = ::Eigen::internal::pmadd(matScalar, vecScalar, outScalar[j]); + privateOffsetNC += Properties::LocalThreadSizeNC; + } + privateOffsetC += Properties::LocalThreadSizeC; +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + in_scratch_ptr += Properties::LocalThreadSizeC; +#endif + } + + auto out_scratch_ptr = local_output + outScratchIndex; + // Each block of 16*16 element in shared memory should reduce to 16*1 + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + *out_scratch_ptr = outScalar[j]; + + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + } + if (is_lhs_vec) { + nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC; + contractId = linearLocalThreadId / Properties::LocalThreadSizeNC; + outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + } + + out_scratch_ptr = local_output + outScratchIndex; + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + EIGEN_UNROLL_LOOP + for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (contractId < offset) { + StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset); + *out_scratch_ptr += out_scratch_ptr[myNeigbourId]; + } + } + // moving to next 16 by 16 block + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + } + + if (contractId == 0) { + out_scratch_ptr = local_output + nonContractId; + StorageIndex global_final_offset = nonContractGroupOffset + nonContractId; + out_ptr += global_final_offset; + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + if (check_boundary(global_final_offset < nonContractDim)) { + auto res = *out_scratch_ptr; + + *out_ptr = res; + out_ptr += Properties::LocalThreadSizeNC; + } + // moving to next 16 by 16 block to ge the next 16 reduced elements + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC; + } + } + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr, + const StorageIndex &linearLocalThreadId, + const StorageIndex &cOffset, const StorageIndex &C) { + local_ptr += InputBlockProperties::c_stride * linearLocalThreadId; + StorageIndex cIndex = cOffset; + for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) { + if (check_boundary(cIndex + InputBlockProperties::c_stride - 1 < C)) { + auto val = read(inpt, StorageIndex(0), + cIndex, StorageIndex(1)); + write(val, local_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + OutScalar val = + (cIndex + i < C) + ? read( + inpt, StorageIndex(0), cIndex + i, StorageIndex(1)) + : OutScalar(0); + write(val, local_ptr + i); + } + } + local_ptr += InputBlockProperties::c_stride * GroupSize; + cIndex += InputBlockProperties::c_stride * GroupSize; + } + } +}; +#endif + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + +/*! + * \brief GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contraction + * operation, when all the dimensions are contracting dimensions. 
This Kernel reduces two tensors to an scalar + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam LhsScalar: determines the left-hand-side scalar type + * + * \tparam RhsScalar: determines the right-hand-side scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification + * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix + * + * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param rng: determines the total input data size + */ +template +struct GeneralScalarContraction { + typedef cl::sycl::accessor Scratch; + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex rng; + + EIGEN_DEVICE_FUNC GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, + OutAccessor out_res_, const StorageIndex rng_) + : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {} + + EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const { + auto out_ptr = out_res; + OutScalar *scratch_ptr = scratch.get_pointer(); + + StorageIndex globalid = itemID.get_global_id(0); + StorageIndex localid = itemID.get_local_id(0); + OutScalar accumulator = OutScalar(0); + for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) { + accumulator = Eigen::internal::pmadd(lhs(0, i), rhs(i, 0), accumulator); + } + auto out_scratch_ptr = scratch_ptr + localid; + *out_scratch_ptr = accumulator; + for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]); + } + } + if (localid == 0) { + out_ptr[itemID.get_group(0)] = accumulator; + } + } +}; +#endif + +} // namespace internal +} // namespace TensorSycl + +template +struct TensorEvaluator, + Eigen::SyclDevice> + : public TensorContractionEvaluatorBase, Eigen::SyclDevice>> { + static_assert(std::is_same::value, + "SYCL tensor contraction does not support output kernels."); + + typedef Eigen::SyclDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + typedef TensorContractionOp XprType; + typedef std::remove_const_t Scalar; + typedef typename XprType::Index StorageIndex; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Base::Storage Storage; + typedef typename Base::EvaluatorPointerType EvaluatorPointerType; + struct TripleDim { + const StorageIndex M; + const StorageIndex N; + const StorageIndex K; + TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {} + }; + enum { + PacketAccess = (PacketType::size > 1), + 
BlockAccess = false, + }; + + static constexpr int Layout = TensorEvaluator::Layout; + static constexpr int LDims = Base::LDims; + static constexpr int RDims = Base::RDims; + static constexpr int ContractDims = Base::ContractDims; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static constexpr int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + typedef std::remove_const_t LhsScalar; + typedef std::remove_const_t RhsScalar; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + template + struct input_mapper_propertis { + static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_matrix = + (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered); + }; + + TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {} + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (!data) { + this->m_result = this->m_device.get( + static_cast(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar)))); + data = this->m_result; + } + evalToSycl(data); + return (this->m_result != NULL); + } + const Eigen::SyclDevice &device() const { return this->m_device; } + void evalToSycl(typename Base::EvaluatorPointerType buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } + } + + template + void evalTyped(typename Base::EvaluatorPointerType buffer) const { + const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size}; + typedef internal::TensorContractionInputMapper< + LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t, + PacketType::size, lhs_inner_dim_contiguous, false, Unaligned, MakePointer> + LhsMapper; + + typedef internal::TensorContractionInputMapper::size, rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned, MakePointer> + RhsMapper; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + if (triple_dim.M == 1 && triple_dim.N == 1) { + launchSC(buffer, lhs, rhs, triple_dim.K); + } else +#endif +#ifndef EIGEN_SYCL_DISABLE_GEMV + if (triple_dim.M != 1 && triple_dim.N == 1) { + LaunchVT(buffer, rhs, lhs, triple_dim.M, triple_dim.K); + } else if (triple_dim.M == 1 && 
triple_dim.N != 1) { + LaunchVT(buffer, lhs, rhs, triple_dim.N, triple_dim.K); + } else // This is equivalent of if (m!=1 && n!=1) +#endif + { + typedef input_mapper_propertis + inpt_mapper_properties; +#ifndef EIGEN_SYCL_DISABLE_SKINNY + bool skinny = false; + auto platform_name = this->device().getPlatformName(); + // This is based on empirical calculation for AMD r9-nano and Fiji + if (platform_name.find("AMD") == 0) { + skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) && + ((triple_dim.M < 1024 && triple_dim.N < 1024) || + (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K))); + } else { + skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) || + ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) || + ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100)); + } + if (skinny) + adjustTT(buffer, lhs, rhs, triple_dim); + else +#endif // EIGEN_SYCL_DISABLE_SKINNY + adjustTT(buffer, lhs, rhs, triple_dim); + } + } + + template + void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + if (device().has_local_memory()) { + typedef TensorSycl::internal::TTPanelSize PanelParameters; + launchTT( + buffer, lhs, rhs, triple_dim); + } +#endif +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF + if (!(device().has_local_memory())) { + typedef TensorSycl::internal::TTPanelSize PanelParameters; + launchTT( + buffer, lhs, rhs, triple_dim); + } +#endif + } + + template + void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { + const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM); + const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN); + const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM; + const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN; + + const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK); + StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK; + StorageIndex groupSizeK = + skinny + ? std::max(std::min(totalTilesK, + (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) / + (groupSizeM * groupSizeN)), + StorageIndex(1)) + : StorageIndex(1); + + const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK; + + const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK; + + const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + const StorageIndex globalRange = totalGroupSize * localRange; + + const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local) + ? 
((Properties::DoubleBuffer + 1) * + (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) + + ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) * + (Properties::TileSizeDimN + Properties::BC)) + : StorageIndex(1); + + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (groupSizeK == 1) { + typedef TensorSycl::internal::TensorContractionKernel + ContractKernelName; + device() + .template binary_kernel_launcher( + lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim) + .wait(); + } else { + typedef TensorSycl::internal::TensorContractionKernel + ContractKernelName; + CoeffReturnType *temp_pointer = static_cast( + device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + + device() + .template binary_kernel_launcher( + lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, + triple_dim) + .wait(); + + typedef Eigen::internal::SumReducer Op; + auto op = Op(); + typedef TensorSycl::internal::SecondStepPartialReduction + ReductionKernel; + + device() + .template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex( + Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))), + cl::sycl::range<1>(localRange)), + StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK) + .wait(); + device().deallocate_temp(temp_pointer); + } + } + +#ifndef EIGEN_SYCL_DISABLE_GEMV + template + void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat, + StorageIndex NC, StorageIndex C) const { + const StorageIndex nonContractDim = NC; + EIGEN_CONSTEXPR StorageIndex NCFactor = 1; + EIGEN_CONSTEXPR StorageIndex CFactor = 1; + EIGEN_CONSTEXPR StorageIndex NCWindow = 16; + typedef Eigen::TensorSycl::internal::TVPanelSize + Properties; + const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC); + const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC); + const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC); + const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC); + const StorageIndex globalRange = + (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC)); + const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC; + const StorageIndex scratchSize = + (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (cNumGroups > 1) { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor + ContractKernelName; + CoeffReturnType *temp_pointer = + static_cast(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + + device() + .template binary_kernel_launcher( + vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C) + .wait(); + + typedef Eigen::internal::SumReducer Op; + typedef 
TensorSycl::internal::SecondStepPartialReduction + ReductionKernel; + + device() + .template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>( + cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)), + cl::sycl::range<1>(localRange)), + StorageIndex(1), Op(), nonContractDim, cNumGroups) + .wait(); + device().deallocate_temp(temp_pointer); + } else { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor + ContractKernelName; + device() + .template binary_kernel_launcher( + vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C) + .wait(); + } + } +#endif + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + template + EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + StorageIndex K) const { + EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & + (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; + + // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread + // reduces at least 512 elementss individually, we get better performance. + const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1); + const StorageIndex global_range = num_work_group * local_range; + + typedef Eigen::TensorSycl::internal::GeneralScalarContraction< + CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false> + ContractKernelName; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + if (num_work_group > 1) { + CoeffReturnType *temp_pointer = + static_cast(device().allocate_temp(num_work_group * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + device() + .template binary_kernel_launcher(lhs, rhs, tmp_global_accessor, + thread_range, local_range, K) + .wait(); + typedef Eigen::internal::SumReducer Op; + typedef TensorSycl::internal::SecondStepFullReducer + GenericRKernel; + device() + .template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, + Op()) + .wait(); + device().deallocate_temp(temp_pointer); + } else { + device() + .template binary_kernel_launcher(lhs, rhs, buffer, thread_range, + local_range, K) + .wait(); + } + } +#endif + + EIGEN_STRONG_INLINE void cleanup() { + this->m_leftImpl.cleanup(); + this->m_rightImpl.cleanup(); + + if (this->m_result) { + this->m_device.deallocate_temp(this->m_result); + this->m_result = NULL; + } + } +}; +} // namespace Eigen +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h new file mode 100644 index 00000000000..288d79f1f2a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -0,0 +1,1552 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H + +// evaluator for thread pool device +#ifdef EIGEN_USE_THREADS + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +template +struct TensorEvaluator, + ThreadPoolDevice> + : public TensorContractionEvaluatorBase, ThreadPoolDevice>> { + typedef ThreadPoolDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef std::remove_const_t Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + static constexpr int Layout = TensorEvaluator::Layout; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef std::conditional_t(Layout) == static_cast(ColMajor), LeftArgType, RightArgType> + EvalLeftArgType; + typedef std::conditional_t(Layout) == static_cast(ColMajor), RightArgType, LeftArgType> + EvalRightArgType; + + static constexpr int LDims = + internal::array_size::Dimensions>::value; + static constexpr int RDims = + internal::array_size::Dimensions>::value; + static constexpr int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static constexpr int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef std::remove_const_t LhsScalar; + typedef std::remove_const_t RhsScalar; + typedef typename internal::gebp_traits Traits; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + template + void evalProduct(Scalar* buffer) const { + evalProductImpl(buffer, NoCallback()); + } + + template + void evalProductAsync(Scalar* buffer, EvalToCallback done) const { + evalProductImpl(buffer, std::move(done)); + } + + template + void evalProductImpl(Scalar* buffer, DoneCallback done) const { + // This function computes a lot of heuristics in multiple steps, and it + // also has multiple exit points. To keep it sane, readable and all in one + // place, sync/async execution decision is made at runtime at the very end. + // + // (1) In sync mode we allocate Context on the stack, submit computations + // to the device thread pool, and block on a barrier until it is + // completed. + // + // (2) In async mode we allocate Context on the heap, and after all tasks + // are finished, we call provided the done callback, and delete a + // context from the heap. + // + // (*) EvalParallelContext & EvalShardedByInnerDimContext owns all the state + // and temporary buffers, required for executing the tensor contraction. + // They are responsible for cleaning it up after contraction is done. 
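+     // As a sketch of what the two modes look like from the caller's side (illustrative only; assumes the
+     // ThreadPool-device API with a done callback and Eigen::Barrier, with arbitrary tensor sizes):
+     //
+     //   Eigen::ThreadPool pool(4);
+     //   Eigen::ThreadPoolDevice device(&pool, 4);
+     //   Eigen::Tensor<float, 2> a(64, 32), b(32, 48), c(64, 48);
+     //   a.setRandom(); b.setRandom();
+     //   const Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
+     //   c.device(device) = a.contract(b, dims);              // (1) sync: blocks until done
+     //   Eigen::Barrier done_barrier(1);
+     //   c.device(device, [&]() { done_barrier.Notify(); })   // (2) async: returns immediately
+     //       = a.contract(b, dims);
+     //   done_barrier.Wait();                                 // rendezvous with the done callback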
+ static const bool IsEvalInSyncMode = std::is_same::value; + + const Index m = this->m_i_size; + const Index n = this->m_j_size; + const Index k = this->m_k_size; + if (m == 0 || n == 0 || k == 0) return; + + // Compute a set of algorithm parameters: + // - kernel block sizes (bm, bn, bk) + // - task grain sizes (number of kernels executed per task: gm, gn) + // - number of threads + // - sharding by row/column + // - parallel packing or first lhs then rhs + // and some derived parameters: + // - number of tasks (nm, nn, nk) + // - number of kernels (nm0, nn0) + // Unfortunately, all these parameters are tightly interdependent. + // So in some cases we first compute approximate values, then compute other + // values based on these approximations and then refine the approximations. + + // There are lots of heuristics here. There is some reasoning behind them, + // but ultimately they are just tuned on contraction benchmarks for + // different input configurations, thread counts and instruction sets. + // So feel free to question any of them. + + // Compute whether we want to shard by row or by column. + // This is a first approximation, it will be refined later. Since we don't + // know number of threads yet we use 2, because what's we are most + // interested in at this point is whether it makes sense to use + // parallelization at all or not. + bool shard_by_col = shardByCol(m, n, 2); + + // First approximation of kernel blocking sizes. + // Again, we don't know number of threads yet, so we use 2. + Index bm, bn, bk; + if (shard_by_col) { + internal::TensorContractionBlocking blocking(k, m, n, + 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking blocking(k, m, n, + 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Compute optimal number of threads. + // Note: we use bk instead of k here because we are interested in amount of + // _parallelizable_ computations, and computations are not parallelizable + // across k dimension. + const TensorOpCost cost = contractionCost(m, n, bm, bn, bk, shard_by_col, false); + int num_threads = + TensorCostModel::numThreads(static_cast(n) * m, cost, this->m_device.numThreads()); + int num_threads_by_k = numThreadsInnerDim(m, n, k); + if (shardByInnerDim(m, n, k, num_threads, num_threads_by_k)) { + // We are in the scenario where it is more effective to shard by the + // inner dimension. + if (IsEvalInSyncMode) { + EvalShardedByInnerDimContext ctx(this, num_threads_by_k, buffer, m, n, k, std::move(done)); + ctx.template run(); + } else { + auto* ctx = + new EvalShardedByInnerDimContext(this, num_threads_by_k, buffer, m, n, k, std::move(done)); + ctx->template runAsync(); + } + + return; + } + + // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost + // model is not tuned. Remove this when the cost model is tuned. + if (n == 1) num_threads = 1; + + if (num_threads == 1) { + TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Unaligned, (buffer)); + if (!IsEvalInSyncMode) done(); + return; + } + + // Now that we know number of threads, recalculate sharding and blocking. 
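+     // As a hedged numeric illustration (the actual block sizes come from TensorContractionBlocking and depend on
+     // scalar type and cache sizes): for m = n = k = 4096 with num_threads = 8 and, say, bm = 128, bn = 128,
+     // bk = 256, the kernel counts computed below are nm0 = div_ceil(4096, 128) = 32, nn0 = 32 and
+     // nk = div_ceil(4096, 256) = 16, which the grain sizes gm/gn then coarsen into larger per-task units.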
+ shard_by_col = shardByCol(m, n, num_threads); + if (shard_by_col) { + internal::TensorContractionBlocking blocking( + k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking blocking( + k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Number of kernels for each dimension. + Index nm0 = numext::div_ceil(m, bm); + Index nn0 = numext::div_ceil(n, bn); + Index nk = numext::div_ceil(k, bk); + + // Calculate task grain size (number of kernels executed per task). + // This task size coarsening serves two purposes: + // 1. It reduces per-task overheads including synchronization overheads. + // 2. It allows to use caches better (reuse the same packed rhs in several + // consecutive kernels). + Index gm = 1; + Index gn = 1; + // If we are sharding by column, then we prefer to reduce rows first. + if (shard_by_col) { + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + } else { + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + } + // Number of tasks in each dimension. + Index nm = numext::div_ceil(nm0, gm); + Index nn = numext::div_ceil(nn0, gn); + + // If there is enough concurrency in the sharding dimension, we choose not + // to paralellize by the other dimension, and execute all kernels in sync + // mode. This reduces parallelism from the nm x nn down to nn + // (shard_by_col==true) or nm (shard_by_col==false). + const Index sharding_dim_tasks = shard_by_col ? nn : nm; + const int num_worker_threads = this->m_device.numThreadsInPool(); + + // With small number of threads we want to make sure that we do not reduce + // parallelism too much. With large number of threads we trade maximum + // parallelism for better memory locality. + const float oversharding_factor = num_worker_threads <= 4 ? 8.0 + : num_worker_threads <= 8 ? 4.0 + : num_worker_threads <= 16 ? 2.0 + : num_worker_threads <= 32 ? 1.0 + : num_worker_threads <= 64 ? 0.8 + : /* num_worker_threads > 64 */ 0.6; + + const bool parallelize_by_sharding_dim_only = sharding_dim_tasks >= oversharding_factor * num_worker_threads; + + // Last by not least, decide whether we want to issue both lhs and rhs + // packing in parallel; or issue lhs packing first, and then issue rhs + // packing when lhs packing completes (for !shard_by_col lhs and rhs are + // swapped). Parallel packing allows more parallelism (for both packing and + // kernels), while sequential packing provides better locality (once + // a thread finishes rhs packing it proceed to kernels with that rhs). + // First, we are interested in parallel packing if there are few tasks. + bool parallel_pack = num_threads >= nm * nn; + // Also do parallel packing if all data fits into L2$. + if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <= l2CacheSize() * num_threads) + parallel_pack = true; + // But don't do it if we will use each rhs only once. Locality seems to be + // more important in this case. + if ((shard_by_col ? nm : nn) == 1) parallel_pack = false; + // Also don't get in the way of parallelize_by_sharding_dim_only + // optimization. + if (parallelize_by_sharding_dim_only) parallel_pack = false; + + // TODO(ezhulnev): With if contexpr we don't need SyncEvalParallelContext. 
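+     // Summary of the decisions made above before dispatching (with illustrative numbers): with 16 worker threads
+     // the oversharding factor is 2.0, so parallelize_by_sharding_dim_only requires at least 32 tasks in the
+     // sharding dimension (nn if shard_by_col, nm otherwise). parallel_pack is turned on when tasks are scarce
+     // (num_threads >= nm * nn) or when one k-slice of packed lhs and rhs fits into the aggregate L2
+     // (m * bk * sizeof(LhsScalar) + n * bk * sizeof(RhsScalar) <= l2CacheSize() * num_threads), and turned back
+     // off when each rhs is used only once or when parallelize_by_sharding_dim_only is already in effect.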
+ if (IsEvalInSyncMode) { +#define CONTEXT_ARGS \ + (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, shard_by_col, parallel_pack, \ + parallelize_by_sharding_dim_only, NoCallback()) \ + .run() + TENSOR_CONTRACTION_DISPATCH(SyncEvalParallelContext, Alignment, CONTEXT_ARGS); +#undef CONTEXT_ARGS + + } else { +#define CONTEXT_ARGS \ + (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, shard_by_col, parallel_pack, \ + parallelize_by_sharding_dim_only, std::move(done)) + TENSOR_CONTRACTION_ASYNC_DISPATCH(EvalParallelContext, DoneCallback, Alignment, CONTEXT_ARGS, run()); +#undef CONTEXT_ARGS + } + } + + // ------------------------------------------------------------------------ // + + // Dummy struct to represent an empty DoneCallback. + + struct NoCallback { + void operator()() { eigen_assert(false && "NoCallback should never be called"); } + }; + + // ------------------------------------------------------------------------ // + + template + class EvalParallelNotification; + + // Synchronous evaluation notification that blocks caller thread in Wait(). + template + class EvalParallelNotification { + public: + EvalParallelNotification(Context*, NoCallback) {} + void Notify() { done_.Notify(); } + void Wait() { done_.Wait(); } + + private: + Eigen::Notification done_; + }; + + // Asynchronous evaluation notification that does not block in Wait(). + template + class EvalParallelNotification { + public: + EvalParallelNotification(Context* ctx, DoneCallback done) : ctx_(ctx), done_(std::move(done)) {} + + void Notify() { + // Make a copy of done callback, because it will be destructed when we + // will delete context in the next line (EvalParallelNotification is a + // data member of EvalParallelContext class). + DoneCallback done_copy = std::move(done_); + + // Delete parallel evaluation context. + delete ctx_; + + // Now safely call the done callback. + done_copy(); + } + + void Wait() {} + + private: + Context* ctx_; + DoneCallback done_; + }; + + // Context orchestrates sync/async parallel contraction evaluation. When it is + // executed in asynchronous mode, it owns all the shared state that might be + // accessible by block packing and kernel tasks. 
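+   // A minimal sketch of the ownership pattern described above (standard C++ only; `MiniAsyncContext` is a
+   // hypothetical name, not part of Eigen): a heap-allocated context must copy the callback out of itself before
+   // `delete this`, exactly as EvalParallelNotification::Notify does above.
+   //
+   //   #include <functional>
+   //   #include <utility>
+   //
+   //   struct MiniAsyncContext {
+   //     std::function<void()> done;
+   //     void finish() {
+   //       std::function<void()> done_copy = std::move(done);  // keep the callback alive
+   //       delete this;              // all context state is destroyed here
+   //       done_copy();              // safe: runs from the local copy only
+   //     }
+   //   };
+   //
+   //   // usage: auto* ctx = new MiniAsyncContext{std::move(user_done)};
+   //   //        submit work that eventually calls ctx->finish();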
+ + template + class EvalParallelContext { + public: + typedef internal::TensorContractionInputMapper::size, + lhs_inner_dim_contiguous, false, Unaligned> + LhsMapper; + typedef internal::TensorContractionInputMapper::size, + rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> + RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + typedef internal::TensorContractionKernel + TensorContractionKernel; + + typedef typename TensorContractionKernel::LhsBlock LhsBlock; + typedef typename TensorContractionKernel::RhsBlock RhsBlock; + typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; + + EvalParallelContext(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, + Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, Index gn, Index nm0, Index nn0, + bool shard_by_col, bool parallel_pack, bool parallelize_by_sharding_dim_only, DoneCallback done) + : created_by_thread_id_(std::this_thread::get_id()), + done_(this, std::move(done)), + device_(self->m_device), + lhs_(self->m_leftImpl, self->m_left_nocontract_strides, self->m_i_strides, self->m_left_contracting_strides, + self->m_k_strides), + rhs_(self->m_rightImpl, self->m_right_nocontract_strides, self->m_j_strides, + self->m_right_contracting_strides, self->m_k_strides), + buffer_(buffer), + output_(buffer, tm), + output_kernel_(self->m_output_kernel), + tensor_contraction_params_(self->m_tensor_contraction_params), + num_threads_(num_threads), + shard_by_col_(shard_by_col), + parallel_pack_(parallel_pack), + parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only), + m_(tm), + n_(tn), + k_(tk), + bm_(bm), + bn_(bn), + bk_(bk), + nm_(nm), + nn_(nn), + nk_(nk), + gm_(gm), + gn_(gn), + nm0_(nm0), + nn0_(nn0), + kernel_(m_, k_, n_, bm_, bk_, bn_), + num_thread_local_allocations_(0), + // We reserve 2X more capacity for a thread local values, than the + // number of threads in the pool to efficiently handle task stealing + // by threads that are not managed by the pool. + thread_local_capacity(2 * (parallelize_by_sharding_dim_only_ ? device_.numThreadsInPool() : 0)), + // We will use only one of the Lhs/Rhs thread local storage depending + // on the shard_by_col value and we parallelize by sharding dim ONLY. + lhs_thread_local_blocks_(shard_by_col_ ? 0 : thread_local_capacity, {*this}, {*this}), + rhs_thread_local_blocks_(shard_by_col_ ? thread_local_capacity : 0, {*this}, {*this}) { + // These two options are mutually exclusive. + eigen_assert(!(parallel_pack && parallelize_by_sharding_dim_only)); + + for (Index x = 0; x < P; x++) { + // Normal number of notifications for k slice switch is + // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only + // nm_ + nn_ notifications, because they will not receive notifications + // from preceding kernels. + state_switch_[x] = + x == 0 ? 1 : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) + (x == P - 1 ? nm_ * nn_ : 0); + state_packing_ready_[x] = parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_); + state_kernel_[x] = new std::atomic*[nm_]; + for (Index m = 0; m < nm_; m++) { + state_kernel_[x][m] = new std::atomic[nn_]; + // Kernels generally receive 3 notifications (previous kernel + 2 + // packing), but the first slice won't get notifications from previous + // kernels. + for (Index n = 0; n < nn_; n++) + state_kernel_[x][m][n].store((x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1), std::memory_order_relaxed); + } + } + + // Allocate memory for packed rhs/lhs matrices. 
+ packed_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/nm0_, // + /*num_rhs=*/nn0_, // + /*num_slices=*/std::min(nk_, P - 1), // + packed_lhs_, packed_rhs_); + + if (parallelize_by_sharding_dim_only_) { + const int num_worker_threads = device_.numThreadsInPool(); + + if (shard_by_col) { + can_use_thread_local_packed_ = new std::atomic[nn_]; + for (int i = 0; i < nn_; ++i) can_use_thread_local_packed_[i].store(true, std::memory_order_relaxed); + + Index num_blocks = num_worker_threads * gn_; + thread_local_pre_alocated_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/0, // + /*num_rhs=*/num_blocks, // + /*num_slices=*/1, // + /*lhs_blocks=*/nullptr, &rhs_thread_local_pre_allocated_); + + } else { + can_use_thread_local_packed_ = new std::atomic[nm_]; + for (int i = 0; i < nm_; ++i) can_use_thread_local_packed_[i].store(true, std::memory_order_relaxed); + + Index num_blocks = num_worker_threads * gm_; + thread_local_pre_alocated_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/num_blocks, // + /*num_rhs=*/0, // + /*num_slices=*/1, &lhs_thread_local_pre_allocated_, // + /*rhs_blocks=*/nullptr); + } + } + } + + ~EvalParallelContext() { + for (Index x = 0; x < P; x++) { + for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m]; + delete[] state_kernel_[x]; + } + kernel_.deallocate(device_, packed_mem_); + if (parallelize_by_sharding_dim_only_) { + kernel_.deallocate(device_, thread_local_pre_alocated_mem_); + delete[] can_use_thread_local_packed_; + } + } + + void run() { + // Kick off packing of the first slice. + signal_switch(0, 1); + + // Wait for overall completion. + // + // If parallel evaluation is executed in async mode, this is a no-op, and + // Wait() will return immediately. In synchronous mode it will block the + // caller thread until it will receive notification from last task. + // + // In async mode, last task when completed will call done callback from + // the same thread, and will delete this context. + // + // TODO(dvyukov): This wait can lead to deadlock if contraction is + // evaluated in synchronous mode. If nthreads contractions are + // concurrently submitted from worker threads, this wait will block all + // worker threads and the system will deadlock. + done_.Wait(); + } + + private: + std::thread::id created_by_thread_id_; + + // This notification is specialized on the type of DoneCallback and can be + // blocking or non-blocking. + EvalParallelNotification done_; + + const Device& device_; + LhsMapper lhs_; + RhsMapper rhs_; + Scalar* const buffer_; + OutputMapper output_; + OutputKernelType output_kernel_; + TensorContractionParams tensor_contraction_params_; + const int num_threads_; + const bool shard_by_col_; + const bool parallel_pack_; + const bool parallelize_by_sharding_dim_only_; + // Matrix sizes. + const Index m_; + const Index n_; + const Index k_; + // Block sizes. + const Index bm_; + const Index bn_; + const Index bk_; + // Number of tasks. + const Index nm_; + const Index nn_; + const Index nk_; + // Task grain sizes (number of kernels executed per task). + const Index gm_; + const Index gn_; + // Number of blocks (this is different from ni_/nn_ because of task size + // coarsening). + const Index nm0_; + const Index nn0_; + // Tensor contraction kernel. + TensorContractionKernel kernel_; + + // Parallelization strategy. + // + // Blocks related to the same k block can run in parallel because they write + // to different output blocks. 
So we parallelize within k slices; this
+  // gives us a parallelism level of m x n. Before we can start any kernels
+  // related to the k-th slice, we need to issue m lhs packing tasks and n rhs
+  // packing tasks.
+  //
+  // However, there is a bottleneck when we are finishing kernels for the k-th
+  // slice (at the very end there is only 1 runnable kernel). To mitigate this
+  // bottleneck we allow kernels from the k-th and k+1-th slices to run in
+  // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same
+  // output block, so they must not run in parallel.
+  //
+  // This gives us the following dependency graph.
+  // On each k slice we have m x n kernel tasks, m lhs packing tasks and n rhs
+  // packing tasks.
+  // Kernel (m, n, k) can start when:
+  //  - kernel (m, n, k-1) has finished
+  //  - lhs packing (m, k) has finished
+  //  - rhs packing (n, k) has finished
+  // Lhs/rhs packing can start when:
+  //  - all k-1 packing has finished (artificially imposed to limit the amount
+  //    of parallel packing)
+  //
+  // On top of that we limit runnable tasks to two consecutive k slices.
+  // This is done to limit the amount of memory we need for packed lhs/rhs
+  // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_).
+  //
+  // state_switch_ tracks when we are ready to switch to the next k slice.
+  // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n).
+  // These variables roll over 3 consecutive k slices: the first two we are
+  // actively executing, plus one to track completion of kernels in the second
+  // slice.
+  static constexpr Index P = 3;
+
+  // Handle to the allocated temporary storage for Lhs/Rhs blocks.
+  BlockMemHandle packed_mem_;
+  std::vector<LhsBlock> packed_lhs_[P - 1];
+  std::vector<RhsBlock> packed_rhs_[P - 1];
+
+  // If we choose to parallelize only by the sharding dimension, each thread
+  // will have its own "thread local" (not C++ thread local storage) memory
+  // for packed_lhs or packed_rhs (depending on whether shard_by_col is false
+  // or true). This memory can't be passed to a kernel that might execute on a
+  // different thread.
+  //
+  // In practice, when we are ready to pack memory for the sharding dimension
+  // (rhs if shard_by_col==true) of the K-th slice, all kernels for the K-1
+  // slice are already computed (99% of the time), and we can pack data into
+  // the thread local storage and guarantee that all the kernels will be
+  // executed immediately in the same thread. This significantly increases the
+  // L1 cache hit ratio and reduces pressure on the memory bus.
+  //
+  // It's still possible that a kernel for the K-th slice will be ready before
+  // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_
+  // and packed_rhs_ to allow kernels to be executed later on a thread
+  // different from the thread that was used for packing.
+
+  // Handle for pre-allocated thread local memory buffers.
+  BlockMemHandle thread_local_pre_alocated_mem_;
+
+  // Only one of these will be initialized depending on the shard_by_col value
+  // (the size will be `num_worker_threads * num_grains_in_the_sharding_dim`).
+  std::vector<LhsBlock> lhs_thread_local_pre_allocated_;
+  std::vector<RhsBlock> rhs_thread_local_pre_allocated_;
+
+  // How many thread local blocks were already allocated.
+  std::atomic<int> num_thread_local_allocations_;
+  const int thread_local_capacity;
+
+  // We will use the pre-allocated Lhs/Rhs blocks defined above if the number
+  // of unique threads in the system is below or equal to the number of threads
+  // in the thread pool. We will fall back on dynamic memory allocation after
+  // that.
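+  // [Editorial sketch] The counting scheme above is a plain dependency-counted
+  // DAG: every task keeps an atomic countdown of unmet dependencies, and the
+  // signal that drops the counter to zero runs (or enqueues) the dependent
+  // task. A minimal standalone illustration of the same pattern follows
+  // (hypothetical names, not part of Eigen; disabled so it does not affect
+  // compilation):
+#if 0
+  struct CountdownTask {
+    std::atomic<int> pending;    // number of unmet dependencies
+    std::function<void()> body;  // work to run when all dependencies are met
+    // Called once per satisfied dependency; whoever makes the counter hit
+    // zero is responsible for running the task.
+    void signal() {
+      if (pending.fetch_sub(1, std::memory_order_acq_rel) == 1) body();
+    }
+  };
+  // kernel(m, n, k) would be a CountdownTask with pending == 3 for k > 0
+  // (previous kernel + lhs packing + rhs packing), matching the
+  // state_kernel_ initialization in the constructor above.
+#endif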
+ + // ThreadLocalBlocks is a container for Lhs or Rhs thread local buffers. Its + // size is equal to the grain size in Lhs/Rhs sharding dimension. + template + class ThreadLocalBlocks { + public: + ThreadLocalBlocks() = default; + + ThreadLocalBlocks(BlockType* base, size_t grain_size) + : is_pre_allocated_(true), thread_local_pre_allocated_base_(base), grain_size_(grain_size) {} + + ThreadLocalBlocks(BlockMemHandle mem_handle, std::vector blocks) + : is_pre_allocated_(false), mem_handle_(std::move(mem_handle)), blocks_(std::move(blocks)) {} + + BlockType& block(int grain_index) { + eigen_assert(grain_index >= 0); + eigen_assert(static_cast(grain_index) < size()); + return is_pre_allocated_ ? thread_local_pre_allocated_base_[grain_index] : blocks_[grain_index]; + } + + void Release(EvalParallelContext& ctx) const { + if (!is_pre_allocated_) { + ctx.kernel_.deallocate(ctx.device_, mem_handle_); + } + } + + size_t size() const { return is_pre_allocated_ ? grain_size_ : blocks_.size(); } + + private: + bool is_pre_allocated_; + + // Reuse pre-allocated thread local buffers. + BlockType* thread_local_pre_allocated_base_ = nullptr; + size_t grain_size_ = 0; + + // These will be initialized only if `is_pre_allocated == false`. + BlockMemHandle mem_handle_{}; + std::vector blocks_; + }; + + // ThreadLocalBlocksInitialize callable does custom thread local blocks + // initialization, and will reuse pre-allocated buffers if possible, or will + // dynamically allocate new memory. + // + // Lhs/Rhs blocks might be of the same type, so we have to pass explicitly + // for what side do we plan to do block allocation. + template + class ThreadLocalBlocksInitialize { + static constexpr bool kIsLhs = !is_rhs && std::is_same::value; + static const bool kIsRhs = is_rhs && std::is_same::value; + static_assert(kIsLhs || kIsRhs, "Unknown block type"); + + using Blocks = ThreadLocalBlocks; + + public: + ThreadLocalBlocksInitialize(EvalParallelContext& ctx) + : ctx_(ctx), num_worker_threads_(ctx_.device_.numThreadsInPool()) {} + + void operator()(Blocks& blocks) { + const int n = ctx_.num_thread_local_allocations_.fetch_add(1, std::memory_order_relaxed); + + if (n >= num_worker_threads_) { + ThreadLocalBlocksAllocator::allocate(ctx_, blocks); + } else { + ThreadLocalBlocksAllocator::reuse(ctx_, n, blocks); + } + } + + private: + // NOTE(ezhulenev): Without 'if constexpr' we have to put calls to + // TensorContractionKernel::allocateSlices into template specializations. + // Also explicit specializations are not allowed at class scope in C++03, + // EvalCtx type parameter is just a workaround for that limitation. 
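+    // [Editorial sketch] With C++17 the two ThreadLocalBlocksAllocator
+    // specializations below could collapse into a single function using
+    // `if constexpr` (illustrative only, mirroring the calls that follow;
+    // disabled so it does not affect compilation):
+#if 0
+    static void allocateBlocks(EvalParallelContext& ctx, Blocks& blocks) {
+      if constexpr (kIsRhs) {
+        std::vector<RhsBlock> rhs_blocks;
+        BlockMemHandle handle = ctx.kernel_.allocateSlices(
+            ctx.device_, /*num_lhs=*/0, /*num_rhs=*/ctx.gn_, /*num_slices=*/1,
+            /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks);
+        blocks = Blocks(std::move(handle), std::move(rhs_blocks));
+      } else {
+        std::vector<LhsBlock> lhs_blocks;
+        BlockMemHandle handle = ctx.kernel_.allocateSlices(
+            ctx.device_, /*num_lhs=*/ctx.gm_, /*num_rhs=*/0, /*num_slices=*/1,
+            /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr);
+        blocks = Blocks(std::move(handle), std::move(lhs_blocks));
+      }
+    }
+#endif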
+ template + struct ThreadLocalBlocksAllocator; + + template + struct ThreadLocalBlocksAllocator { + static void allocate(EvalCtx& ctx, Blocks& blocks) { + std::vector rhs_blocks; + BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(ctx.device_, + /*num_lhs=*/0, + /*num_rhs=*/ctx.gn_, + /*num_slices=*/1, + /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks); + + blocks = ThreadLocalBlocks(std::move(mem_handle), std::move(rhs_blocks)); + } + + static void reuse(EvalCtx& ctx, int index, Blocks& blocks) { + RhsBlock* ptr = &ctx.rhs_thread_local_pre_allocated_[ctx.gn_ * index]; + blocks = ThreadLocalBlocks(ptr, ctx.gn_); + } + }; + + template + struct ThreadLocalBlocksAllocator { + static void allocate(EvalCtx& ctx, Blocks& blocks) { + std::vector lhs_blocks; + BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(ctx.device_, + /*num_lhs=*/ctx.gm_, + /*num_rhs=*/0, + /*num_slices=*/1, + /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr); + + blocks = ThreadLocalBlocks(std::move(mem_handle), std::move(lhs_blocks)); + } + + static void reuse(EvalCtx& ctx, int index, Blocks& blocks) { + LhsBlock* ptr = &ctx.lhs_thread_local_pre_allocated_[ctx.gm_ * index]; + blocks = ThreadLocalBlocks(ptr, ctx.gm_); + } + }; + + EvalParallelContext& ctx_; + const int num_worker_threads_; + }; + + template + class ThreadLocalBlocksRelease { + public: + using Blocks = ThreadLocalBlocks; + ThreadLocalBlocksRelease(EvalParallelContext& ctx) : ctx_(ctx) {} + void operator()(Blocks& blocks) { blocks.Release(ctx_); } + + private: + EvalParallelContext& ctx_; + }; + + // ThreadLocalBlocks initialization callables. + using ThreadLocalLhsInit = ThreadLocalBlocksInitialize; + using ThreadLocalRhsInit = ThreadLocalBlocksInitialize; + + // ThreadLocalBlocks release callables. + using ThreadLocalLhsRelease = ThreadLocalBlocksRelease; + using ThreadLocalRhsRelease = ThreadLocalBlocksRelease; + + // Thread local containers for Lhs/Rhs block packs. In practice only one of + // them will be used, depending on the shard_by_col value. + Eigen::ThreadLocal, ThreadLocalLhsInit, ThreadLocalLhsRelease> lhs_thread_local_blocks_; + Eigen::ThreadLocal, ThreadLocalRhsInit, ThreadLocalRhsRelease> rhs_thread_local_blocks_; + + // After a particular shard for Kth slice missed thread local execution + // opportunity (K-1 slice didn't complete kernels execution), we can no + // longer schedule K+1 and following slices in thread local mode, because + // there is no more guarantee that previous kernels were executed + // sequentially in the same thread (size is nn_ or nm_). + std::atomic* can_use_thread_local_packed_; + + std::atomic** state_kernel_[P]; + // state_switch_ is frequently modified by worker threads, while other + // fields are read-only after constructor. Let's move it to a separate cache + // line to reduce cache-coherency traffic. + char pad_[128]; + std::atomic state_packing_ready_[P]; + std::atomic state_switch_[P]; + + LhsBlock& packed_lhs(Index m, Index k, Index m1, bool use_thread_local) { + if (use_thread_local) { + eigen_assert(!shard_by_col_); + ThreadLocalBlocks& blocks = lhs_thread_local_blocks_.local(); + + Index grain_index = m1 - m * gm_; + return blocks.block( + internal::convert_index(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index? 
+ } else { + return packed_lhs_[k % (P - 1)][m1]; + } + } + + RhsBlock& packed_rhs(Index n, Index k, Index n1, bool use_thread_local) { + if (use_thread_local) { + eigen_assert(shard_by_col_); + ThreadLocalBlocks& blocks = rhs_thread_local_blocks_.local(); + + Index grain_index = n1 - n * gn_; + return blocks.block( + internal::convert_index(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index? + } else { + return packed_rhs_[k % (P - 1)][n1]; + } + } + + // In following two methods (pack_lhs and pack_rhs), if we know for sure + // that we'll be able to immediately call a kernel with packed data, and do + // not submit it to the thread pool, we can use thread local memory for + // packed data. + // + // We can only reliably check it if we are running all kernels in sync mode + // (parallelize only by sharding dim). If kernel for m==0 (n==0) is ready to + // run, it's guaranteed that all kernels with larger values of m (n) are + // also ready, because we execute them in the same order for all K slices. + + void pack_lhs(Index m, Index k) { + bool use_thread_local = false; + + if (parallelize_by_sharding_dim_only_ && !shard_by_col_ && + can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) { + if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) { + use_thread_local = true; + } else { + // If we can't guarantee that all kernels in `k` slice will be + // executed sequentially in current thread, it's no longer safe to use + // thread local memory in following slices along the k dimensions. + eigen_assert(k > 0); + can_use_thread_local_packed_[m].store(false, std::memory_order_relaxed); + } + } + + const Index mend = m * gm_ + gm(m); + for (Index m1 = m * gm_; m1 < mend; m1++) + kernel_.packLhs(&packed_lhs(m, k, m1, use_thread_local), lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); + + if (!parallel_pack_ && shard_by_col_) { + eigen_assert(!use_thread_local); + signal_packing(k); + } else { + signal_switch(k + 1); + for (Index n = nn_ - 1; n >= 0; n--) { + bool sync = parallelize_by_sharding_dim_only_ || n == 0; + signal_kernel(m, n, k, sync, use_thread_local); + } + } + } + + void pack_rhs(Index n, Index k) { + bool use_thread_local = false; + + if (parallelize_by_sharding_dim_only_ && shard_by_col_ && + can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) { + if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) { + use_thread_local = true; + } else { + // If we can't guarantee that all kernels in `k` slice will be + // executed sequentially in current thread, it's no longer safe to use + // thread local memory in following slices along the k dimensions. + eigen_assert(k > 0); + can_use_thread_local_packed_[n].store(false, std::memory_order_relaxed); + } + } + + const Index nend = n * gn_ + gn(n); + for (Index n1 = n * gn_; n1 < nend; n1++) { + if (!TensorContractionKernel::HasBeta && k == 0) { + // Zero the output memory in parallel, only if contraction kernel does + // not support `beta`. Otherwise we will pass beta 0.0 to the first + // call to the `TensorContractionKernel::invoke()`. + // + // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn + // x m) row. Safe to do here because all kernels that will write to + // this memory depend on completion of this task. Note: don't call + // device_.fill() here. device_.fill() blocks on thread pool + // worker thread, which can lead to underutilization and deadlocks. 
+ std::fill_n(buffer_ + n1 * bn_ * m_, bn(n1) * m_, Scalar(0)); + } + kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local), rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); + } + + if (parallel_pack_ || shard_by_col_) { + signal_switch(k + 1); + for (Index m = nm_ - 1; m >= 0; m--) { + bool sync = parallelize_by_sharding_dim_only_ || m == 0; + signal_kernel(m, n, k, sync, use_thread_local); + } + } else { + eigen_assert(!use_thread_local); + signal_packing(k); + } + } + + void kernel(Index m, Index n, Index k, bool use_thread_local) { + // Note: order of iteration matters here. Iteration over m is innermost + // because we want to reuse the same packed rhs in consecutive tasks + // (rhs fits into L2$ while lhs only into L3$). + const Index nend = n * gn_ + gn(n); + const Index mend = m * gm_ + gm(m); + + // NOTE: output = alpha * LHS * RHS + beta * output. + const Scalar alpha = Scalar(1); + const Scalar beta = (TensorContractionKernel::HasBeta && k == 0) ? Scalar(0) : Scalar(1); + + if (shard_by_col_) { + for (Index n1 = n * gn_; n1 < nend; n1++) { + for (Index m1 = m * gm_; m1 < mend; m1++) { + const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); + kernel_.invoke(output_mapper, packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), + packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), bk(k), bn(n1), alpha, beta); + + // We are done with the last task for the [m1, n1] block. + if (k + 1 == nk_) { + output_kernel_(output_mapper, tensor_contraction_params_, m1 * bm_, n1 * bn_, bm(m1), bn(n1)); + } + } + } + } else { + for (Index m1 = m * gm_; m1 < mend; m1++) + for (Index n1 = n * gn_; n1 < nend; n1++) { + const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); + kernel_.invoke(output_mapper, packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), + packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), bk(k), bn(n1), alpha, beta); + + // We are done with the last task for the [m1, n1] block. + if (k + 1 == nk_) { + output_kernel_(output_mapper, tensor_contraction_params_, m1 * bm_, n1 * bn_, bm(m1), bn(n1)); + } + } + } + signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false); + signal_switch(k + 2); + } + + void signal_packing(Index k) { + eigen_assert(!parallel_pack_); + Index s = state_packing_ready_[k % P].fetch_sub(1); + eigen_assert(s > 0); + if (s != 1) return; + state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_; + enqueue_packing(k, shard_by_col_); + } + + void signal_kernel(Index m, Index n, Index k, bool sync, bool use_thread_local) { + std::atomic* state = &state_kernel_[k % P][m][n]; + Index s = state->load(); + eigen_assert(s > 0); + if (s != 1 && state->fetch_sub(1) != 1) { + eigen_assert(!use_thread_local); + return; + } + state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed); + if (sync) { + kernel(m, n, k, use_thread_local); + } else { + eigen_assert(!use_thread_local); + device_.enqueueNoNotification([this, m, n, k, use_thread_local]() { + kernel(m, n, k, use_thread_local); + }); + } + } + + void signal_switch(Index k, Index v = 1) { + Index s = state_switch_[k % P].fetch_sub(v); + eigen_assert(s >= v); + if (s != v) return; + + // Ready to switch to the next k slice. + // Reset counter for the next iteration. + state_switch_[k % P] = (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) + nm_ * nn_; + if (k < nk_) { + // Issue lhs/rhs packing. Their completion will in turn kick off + // kernels. 
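+      // [Editorial note] Sanity check of the reset value above with
+      // hypothetical counts nm_ = 2, nn_ = 3 and parallel packing: the next
+      // slice must collect (2 + 3) packing notifications plus 2 * 3 kernel
+      // notifications before switching, so the counter restarts at 11.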
+ if (parallel_pack_) { + enqueue_packing(k, !shard_by_col_); + enqueue_packing(k, shard_by_col_); + } else if (shard_by_col_) { + enqueue_packing(k, false); + } else { + enqueue_packing(k, true); + } + + // Termination handling. + // Because kernel completion signals k + 2 switch, we need to finish nk + // + 2 slices without issuing any tasks on nk + 1 slice. So here we + // pretend that all nk + 1 packing tasks just finish instantly; so that + // nk + 2 switch only waits for completion of nk kernels. + } else if (k == nk_) { + signal_switch(k + 1, parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)); + } else { + done_.Notify(); + } + } + + // Enqueue all rhs/lhs packing for k-th slice. + void enqueue_packing(Index k, bool rhs) { enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs); } + + void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) { + if (end - start == 1) { + if (rhs) + pack_rhs(start, k); + else + pack_lhs(start, k); + } else { + while (end - start > 1) { + Index mid = (start + end) / 2; + device_.enqueueNoNotification([this, mid, end, k, rhs]() { + enqueue_packing_helper(mid, end, k, rhs); + }); + end = mid; + } + + // Decide if we want to run first packing task (start == 0) in + // async mode if we parallelize only by sharding dim: + // (1) pack_lhs and pack_rhs call signal_switch before completing + // all calls to signal_kernel, which in sync mode might lead + // to the execution of the first kernel of the k+1 slice, before + // completing a call to the last kernel of the k slice. + // (2) all pack tasks for sharded dim must be executed in a thread + // pool to get pre-allocated thead local buffers. + bool pack_async = (start == 0) && (parallelize_by_sharding_dim_only_ && shard_by_col_ == rhs) && + (k > 0 || std::this_thread::get_id() == created_by_thread_id_); + + if (pack_async) { + device_.enqueueNoNotification([this, start, end, k, rhs]() { + enqueue_packing_helper(start, end, k, rhs); + }); + } else { + enqueue_packing_helper(start, end, k, rhs); + } + } + } + + // Block sizes with accounting for potentially incomplete last block. + Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; } + Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; } + Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; } + // Task grain sizes accounting for potentially incomplete last task. + Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; } + Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; } + + EvalParallelContext(const EvalParallelContext&) = delete; + void operator=(const EvalParallelContext&) = delete; + }; + + template + using SyncEvalParallelContext = EvalParallelContext; + + // ------------------------------------------------------------------------ // + + // EvalShardedByInnerDimContext orchestrates sync/async contraction + // evaluation, when we shard by inner dimension. When it is executed in + // asynchronous mode, it owns all the shared state that might be accessible by + // block processing tasks. 
+
+  template <typename DoneCallback>
+  struct EvalShardedByInnerDimContext {
+    EvalShardedByInnerDimContext(const Self* self, int num_threads, Scalar* result_buffer, Index m_size, Index n_size,
+                                 Index k_size, DoneCallback done_callback)
+        : evaluator(self),
+          m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous),
+          m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous),
+          m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered),
+          result(result_buffer),
+          m(m_size),
+          n(n_size),
+          k(k_size),
+          done(std::move(done_callback)),
+          buffer_size_bytes(m * n * sizeof(Scalar)),
+          block_size(blockSize(k, num_threads)),
+          num_blocks(numext::div_ceil(k, block_size)),
+          num_pending_blocks(internal::convert_index<int>(num_blocks)),
+          l0_ranges(numext::div_ceil(num_blocks, l0_size)),
+          l0_state(l0_ranges),
+          block_buffers(num_blocks) {
+      // Keep a count of pending gemm tasks for each l0 range.
+      for (int i = 0; i < l0_ranges; ++i) {
+        const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i);
+        l0_state.emplace_back(internal::convert_index<int>(num_pending_tasks));
+      }
+
+      // Allocate temporary buffers for each block.
+      for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) {
+        Scalar* buf = block_idx == 0 ? result : static_cast<Scalar*>(evaluator->m_device.allocate(buffer_size_bytes));
+        block_buffers.emplace_back(buf);
+      }
+    }
+
+    ~EvalShardedByInnerDimContext() {
+      for (Index i = 1; i < num_blocks; ++i) {
+        evaluator->m_device.deallocate(block_buffers[i]);
+      }
+    }
+
+    template <int Alignment>
+    void run() {
+      Barrier barrier(internal::convert_index<unsigned int>(num_blocks));
+      eval<Alignment>(barrier, 0, num_blocks);
+      barrier.Wait();
+
+      // Aggregate partial sums from l0 ranges.
+      aggregateL0Blocks<Alignment>();
+
+      // Apply output kernel.
+      applyOutputKernel();
+    }
+
+    template <int Alignment>
+    void runAsync() {
+      evalAsync<Alignment>(0, num_blocks);
+    }
+
+   private:
+    // The underlying GEMM kernel assumes that k is a multiple of
+    // the packet size, and subtle breakage occurs if this is violated.
+    static const Index packet_size = internal::packet_traits<RhsScalar>::size;
+
+    const Self* evaluator;  // TensorContraction evaluator
+
+    // These fields are required by the TENSOR_CONTRACTION_DISPATCH macro.
+    bool m_lhs_inner_dim_contiguous;
+    bool m_rhs_inner_dim_contiguous;
+    bool m_rhs_inner_dim_reordered;
+
+    Scalar* result;
+
+    Index m;
+    Index n;
+    Index k;
+
+    DoneCallback done;
+
+    // ----------------------------------------------------------------------//
+    // Algorithm parameters.
+
+    // We will compute partial results into buffers of this size.
+    Index buffer_size_bytes;
+
+    Index block_size;
+    Index num_blocks;
+
+    // Keep track of pending tasks when evaluating in async mode.
+    std::atomic<int> num_pending_blocks;
+
+    // We compute partial gemm results in parallel, and to get the final result
+    // we need to add them all together. For a large number of threads (>= 48)
+    // this adds a very expensive sequential step at the end.
+    //
+    // We split the [0, num_blocks) range into small ranges, and when a task for
+    // a block finishes its partial gemm computation, it checks if it was the
+    // last gemm in the range, and if so, it adds all blocks of the range.
+    //
+    // After all tasks are done, we need to add only these pre-aggregated
+    // blocks.
+
+    // For now we use just a single level of ranges to compute pre-aggregated
+    // partial sums, but in general we can use more layers to compute tree
+    // aggregation in parallel and reduce the size of the sequential step.
+    //
+    // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make
+    // sense only if number of threads >= ~128?
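+    // [Editorial note] Worked example of the range bookkeeping above, with
+    // hypothetical counts: num_blocks = 10 and l0_size = 4 give
+    // l0_ranges = div_ceil(10, 4) = 3. Ranges 0 and 1 hold 4 blocks each, and
+    // the last range holds actualRangeSize(3, 4, 2) = 10 + 4 - 4 * 3 = 2
+    // blocks. The task finishing last in a range folds that range's partial
+    // sums into the range's first block, so the final sequential step adds
+    // only 3 pre-aggregated blocks instead of 10.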
+ static const Index l0_size = 4; + Index l0_ranges; + + // Keep count of pending gemm tasks for each l0 range. + MaxSizeVector> l0_state; // [0, l0_ranges) + + // Buffers allocated for each temporary block computation. + MaxSizeVector block_buffers; // [0, num_blocks) + + template + void processBlock(Index block_idx, Index begin, Index end) { + Scalar* buf = block_buffers[block_idx]; + + TENSOR_CONTRACTION_DISPATCH(evaluator->template evalGemmPartialWithoutOutputKernel, Alignment, + (buf, begin, end, + /*num_threads=*/internal::convert_index(num_blocks))); + + // Check if it was the last task in l0 range. + const Index l0_index = block_idx / l0_size; + const int v = l0_state[l0_index].fetch_sub(1); + eigen_assert(v >= 1); + + // If we processed the last block of the range, we can aggregate all + // partial results into the first block of the range. + if (v == 1) { + const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index); + const Index dst_block_idx = l0_index * l0_size; + + if (rng_size == l0_size) { + addAllToBuffer(m * n, + /*src_buf0=*/block_buffers[dst_block_idx + 1], + /*src_buf1=*/block_buffers[dst_block_idx + 2], + /*src_buf2=*/block_buffers[dst_block_idx + 3], + /*dst_buf= */ block_buffers[dst_block_idx]); + } else { + // Aggregate blocks of potentially incomplete last range. + for (int i = 1; i < rng_size; ++i) { + addToBuffer(m * n, + /*src_buf=*/block_buffers[dst_block_idx + i], + /*dst_buf=*/block_buffers[dst_block_idx]); + } + } + } + } + + // Aggregate partial sums from l0 ranges. + template + void aggregateL0Blocks() const { + Index l0_index = 1; + + for (; l0_index + 2 < l0_ranges; l0_index += 3) { + addAllToBuffer(m * n, + /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size], + /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size], + /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size], + /*dst_buf= */ block_buffers[0]); + } + + for (; l0_index < l0_ranges; ++l0_index) { + addToBuffer(m * n, block_buffers[l0_index * l0_size], block_buffers[0]); + } + } + + void applyOutputKernel() const { + typedef internal::blas_data_mapper OutputMapper; + evaluator->m_output_kernel(OutputMapper(result, m), evaluator->m_tensor_contraction_params, + static_cast(0), static_cast(0), m, n); + } + + // Compute block size with accounting for potentially incomplete last block. + Index actualBlockSize(Index block_idx) const { + return block_idx + 1 < num_blocks ? block_size : k + block_size - block_size * num_blocks; + }; + + // Compute range size with accounting for potentially incomplete last range. + Index actualRangeSize(Index num_ranges, Index range_size, Index range_idx) const { + eigen_assert(range_idx < num_ranges); + return range_idx + 1 < num_ranges ? 
range_size : num_blocks + range_size - range_size * num_ranges; + }; + + template + EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf, Scalar* tgt_buf) { + const int output_packet_size = internal::unpacket_traits::size; + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const PacketReturnType src_val = internal::pload(src_buf + i); + const PacketReturnType tgt_val = internal::ploadt(tgt_buf + i); + const PacketReturnType sum = internal::padd(src_val, tgt_val); + internal::pstoret(tgt_buf + i, sum); + } + for (; i < n; ++i) { + tgt_buf[i] += src_buf[i]; + } + } + + template + EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n, const Scalar* src_buf0, const Scalar* src_buf1, + const Scalar* src_buf2, Scalar* dst_buf) { + using ::Eigen::internal::padd; + using ::Eigen::internal::pload; + using ::Eigen::internal::ploadt; + using ::Eigen::internal::pstoret; + + const int output_packet_size = internal::unpacket_traits::size; + + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const auto src_val0 = pload(src_buf0 + i); + const auto src_val1 = pload(src_buf1 + i); + const auto src_val2 = pload(src_buf2 + i); + + const auto dst_val = ploadt(dst_buf + i); + const auto sum = padd(padd(dst_val, src_val0), padd(src_val1, src_val2)); + + pstoret(dst_buf + i, sum); + } + for (; i < n; ++i) { + dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i]; + } + } + + template + void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) { + while (end_block_idx - start_block_idx > 1) { + Index mid_block_idx = (start_block_idx + end_block_idx) / 2; + evaluator->m_device.enqueueNoNotification([this, &barrier, mid_block_idx, end_block_idx]() { + eval(barrier, mid_block_idx, end_block_idx); + }); + end_block_idx = mid_block_idx; + } + + Index block_idx = start_block_idx; + Index block_start = block_idx * block_size; + Index block_end = block_start + actualBlockSize(block_idx); + + processBlock(block_idx, block_start, block_end); + barrier.Notify(); + } + + template + void evalAsync(Index start_block_idx, Index end_block_idx) { + while (end_block_idx - start_block_idx > 1) { + Index mid_block_idx = (start_block_idx + end_block_idx) / 2; + evaluator->m_device.enqueueNoNotification( + [this, mid_block_idx, end_block_idx]() { + evalAsync(mid_block_idx, end_block_idx); + }); + end_block_idx = mid_block_idx; + } + + Index block_idx = start_block_idx; + + Index block_start = block_idx * block_size; + Index block_end = block_start + actualBlockSize(block_idx); + + processBlock(block_idx, block_start, block_end); + + int v = num_pending_blocks.fetch_sub(1); + eigen_assert(v >= 1); + + if (v == 1) { + // Aggregate partial sums from l0 ranges. + aggregateL0Blocks(); + + // Apply output kernel. + applyOutputKernel(); + + // NOTE: If we call `done` callback before deleting this (context), + // it might deallocate Self* pointer captured by context, and we'll + // fail in destructor trying to deallocate temporary buffers. + + // Move done call back from context before it will be destructed. + DoneCallback done_copy = std::move(done); + + // We are confident that we are the last one who touches context. + delete this; + + // Now safely call the done callback. 
+      done_copy();
+    }
+  }
+
+  // The cost model doesn't capture well the cost associated with constructing
+  // tensor contraction mappers and computing loop bounds in gemm_pack_lhs
+  // and gemm_pack_rhs, so we specify a minimum desired block size.
+  static Index blockSize(Index k, int num_threads) {
+    const auto round_up = [=](Index index) -> Index {
+      const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
+      return numext::div_ceil(index, kmultiple) * kmultiple;
+    };
+
+    const Index target_block_size = round_up(numext::div_ceil(k, num_threads));
+    const Index desired_min_block_size = 12 * packet_size;
+
+    return numext::mini(k, numext::maxi(desired_min_block_size, target_block_size));
+  }
+
+  EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete;
+  void operator=(const EvalShardedByInnerDimContext&) = delete;
+};
+
+// ------------------------------------------------------------------------ //
+
+// Below are the functions used by the evalProductImpl heuristics to select
+// optimal parameters for the parallelization algorithm.
+
+// Decide whether we want to shard the m x n contraction by columns or by rows.
+static bool shardByCol(Index m, Index n, Index num_threads) {
+  // Note: we are comparing both n and m against Traits::nr; this is not
+  // a mistake. We are trying to figure out how both n and m will fit into
+  // the main sharding dimension.
+
+  // Sharding by column is the default
+  // ... unless there is enough data for vectorization over rows
+  if (m / num_threads >= Traits::nr &&
+      // and not enough data for vectorization over columns
+      (n / num_threads < Traits::nr ||
+       // ... or barely enough data for vectorization over columns,
+       // but it is not evenly divisible across threads
+       (n / num_threads < 4 * Traits::nr && (n % (num_threads * Traits::nr)) != 0 &&
+        // ... and it is evenly divisible across threads for rows
+        ((m % (num_threads * Traits::nr)) == 0 ||
+         // .. or it is not evenly divisible in either dimension, but
+         // there is much more data over rows so that corner effects are
+         // mitigated.
+         (m / n >= 6)))))
+    return false;
+  // ... or shard by rows after all if the matrix is just substantially
+  // elongated over the row dimension.
+  if (n / num_threads < 16 * Traits::nr && m > n * 32) return false;
+  return true;
+}
+
+Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn, int num_threads, bool shard_by_col) const {
+  Index gm = 1;
+  Index gm1 = 1;
+  Index nm0 = numext::div_ceil(m, bm);
+  Index nm1 = nm0;
+  for (;;) {
+    // Find the next candidate for the m grain size. It needs to result in a
+    // different number of blocks. E.g. if we have 10 kernels, we want to try
+    // 5 and 10, but not 6, 7, 8 and 9.
+    while (gm1 <= nm0 && nm1 == numext::div_ceil(nm0, gm1)) gm1++;
+    if (gm1 > nm0) break;
+    // Check the candidate.
+    int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads, shard_by_col);
+    if (res < 0) break;
+    nm1 = numext::div_ceil(nm0, gm1);
+    if (res == 0) continue;
+    // Commit new grain size.
+ gm = gm1; + } + return gm; + } + + Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm, int num_threads, bool shard_by_col) const { + Index gn = 1; + Index gn1 = 1; + Index nn0 = numext::div_ceil(n, bn); + Index nn1 = nn0; + for (;;) { + while (gn1 <= nn0 && nn1 == numext::div_ceil(nn0, gn1)) gn1++; + if (gn1 > nn0) break; + int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads, shard_by_col); + if (res < 0) break; + nn1 = numext::div_ceil(nn0, gn1); + if (res == 0) continue; + gn = gn1; + } + return gn; + } + + // checkGrain checks whether grain (gm, gn) is suitable and is better than + // (oldgm, oldgn). + int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm, Index gn, Index oldgm, Index oldgn, + int num_threads, bool shard_by_col) const { + const TensorOpCost cost = contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true); + double taskSize = TensorCostModel::taskSize(static_cast(bm) * gm * bn * gn, cost); + // If the task is too small, then we agree on it regardless of anything + // else. Otherwise synchronization overheads will dominate. + if (taskSize < 1) return 1; + // If it is too large, then we reject it and all larger tasks. + if (taskSize > 2) return -1; + // Now we are in presumably good task size range. + // The main deciding factor here is parallelism. Consider that we have 12 + // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes. + // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4 + // of cores will be busy). While grain size 3 gives us 4 tasks, which gives + // us parallelism of 1 (we can load all cores). + Index nm0 = numext::div_ceil(m, bm); + Index nn0 = numext::div_ceil(n, bn); + Index new_tasks = numext::div_ceil(nm0, gm) * numext::div_ceil(nn0, gn); + double new_parallelism = + static_cast(new_tasks) / (numext::div_ceil(new_tasks, num_threads) * num_threads); + Index old_tasks = numext::div_ceil(nm0, oldgm) * numext::div_ceil(nn0, oldgn); + double old_parallelism = + static_cast(old_tasks) / (numext::div_ceil(old_tasks, num_threads) * num_threads); + if (new_parallelism > old_parallelism || new_parallelism == 1) return 1; + return 0; + } + + TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, bool shard_by_col, + bool prepacked) const { + const int packed_size = std::min(PacketType::size, PacketType::size); + const int output_packet_size = internal::unpacket_traits::size; + const double kd = static_cast(bk); + double compute_bandwidth = computeBandwidth(false, bm, bn, bk); + // Computations. + TensorOpCost cost = TensorOpCost(0, 0, kd * compute_bandwidth, true, packed_size); + // Output stores. + cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); + if (prepacked) { + // Packing and kernels are executed in different tasks. When we calculate + // task grain size we look only at kernel cost assuming that kernel + // is more expensive than packing. + return cost; + } + // Lhs/rhs loads + computations. + TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n); + TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m); + // Lhs packing memory cost does not contribute considerably to overall + // execution time because lhs is prefetched early and accessed sequentially. 
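+    // [Editorial note] For scale, with a hypothetical bk = 128 and
+    // compute_bandwidth = 0.5, the compute term added above is
+    // kd * compute_bandwidth = 64 cost units per output coefficient, so the
+    // packing memory term dropped below is a second-order correction.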
+ if (shard_by_col) + lhsCost.dropMemoryCost(); + else + rhsCost.dropMemoryCost(); + return cost + lhsCost + rhsCost; + } + + // Decide whether we want to shard m x k x n contraction over the inner + // (contraction) dimension (k). + static bool shardByInnerDim(Index m, Index n, Index k, int num_threads, int num_threads_by_k) { + std::ptrdiff_t bufsize = m * n * sizeof(Scalar); + bool shard_by_k = false; + if (n == 1 || // If mat*vec or... + num_threads_by_k < 2 || // running single threaded or... + num_threads_by_k < num_threads || // sharding by k gives less parallelism or... + bufsize > l3CacheSize() / num_threads_by_k || // need more buffer space + // than L3 cache or... + k / num_threads_by_k < 2 * Traits::nr) { // k per thread is tiny. + shard_by_k = false; + } else if (numext::maxi(m, n) / num_threads < Traits::nr || // both other dimensions are tiny or... + // k per thread is not small and... + (k / num_threads_by_k > 8 * Traits::nr && + // one of the outer dimensions is tiny or sharding by k offers + // more parallelism. + (numext::mini(m, n) < 2 * Traits::nr || num_threads_by_k > num_threads))) { + shard_by_k = true; + } + return shard_by_k; + } + + TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const { + // Compute cost. + const int output_packet_size = internal::unpacket_traits::size; + TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size); + // Output stores. + cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); + TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m; + TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * n; + // Since the inner gemm kernel is always sharded by column, the lhs + // load cost is negligible. + lhsCost.dropMemoryCost(); + return cost + lhsCost + rhsCost; + } + + int numThreadsInnerDim(Index m, Index n, Index k) const { + const int output_packet_size = internal::unpacket_traits::size; + TensorOpCost cost = contractionCostPerInnerDim(m, n, k); + double total_parallel_cost = TensorCostModel::totalCost(k, cost); + // Cost of reduction step accumulating the m*n per-thread buffers into the + // result. + double reduction_cost = + TensorCostModel::totalCost(m * n, TensorOpCost(2, 1, 1, true, output_packet_size)); + int num_threads = 1; + double min_cost = total_parallel_cost; + double kPerThreadOverHead = 3000; + double kFixedOverHead = 100000; + for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) { + double sequential_cost = kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead); + double parallel_cost = total_parallel_cost / nt + sequential_cost; + if (parallel_cost < min_cost) { + num_threads = nt; + min_cost = parallel_cost; + } + } + return num_threads; + } + + double computeBandwidth(bool shard_by_col, Index bm, Index bn, Index bk) const { + // Peak VFMA bandwidth is 0.5. However if we have not enough data for + // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined + // experimentally. + double computeBandwidth = bk == 1 ? 4.0 + : (shard_by_col ? bn : bm) < Traits::nr || (shard_by_col ? bm : bn) < Traits::mr ? 2.0 + : 0.5; +#ifndef EIGEN_VECTORIZE_FMA + // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. + // However for MULPS/ADDPS we have dependent sequence of 2 such + // instructions, + // so overall bandwidth is 1.0. 
+ if (computeBandwidth == 0.5) computeBandwidth = 1.0; +#endif + return computeBandwidth; + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_THREADS +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h new file mode 100644 index 00000000000..5e4a586d8b1 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -0,0 +1,416 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorConversionOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor conversion class. This class makes it possible to vectorize + * type casting operations when the number of scalars per packet in the source + * and the destination type differ + */ +namespace internal { +template +struct traits > { + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef TargetType Scalar; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = traits::Layout; + enum { Flags = 0 }; + typedef typename TypeConversion::PointerType>::type PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorConversionOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorConversionOp type; +}; + +} // end namespace internal + +template +struct PacketConverter; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + return internal::pcast(m_impl.template packet(index)); + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); + SrcPacket src3 = m_impl.template 
packet(index + 2 * SrcPacketSize); + SrcPacket src4 = m_impl.template packet(index + 3 * SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2, src3, src4); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + 1 * SrcPacketSize); + SrcPacket src3 = m_impl.template packet(index + 2 * SrcPacketSize); + SrcPacket src4 = m_impl.template packet(index + 3 * SrcPacketSize); + SrcPacket src5 = m_impl.template packet(index + 4 * SrcPacketSize); + SrcPacket src6 = m_impl.template packet(index + 5 * SrcPacketSize); + SrcPacket src7 = m_impl.template packet(index + 6 * SrcPacketSize); + SrcPacket src8 = m_impl.template packet(index + 7 * SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2, src3, src4, src5, src6, src7, src8); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) + : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + // Only call m_impl.packet() when we have direct access to the underlying data. This + // ensures that we don't compute the subexpression twice. We may however load some + // coefficients twice, but in practice this doesn't negatively impact performance. 
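+    // [Editorial note] Worked example of the SrcCoeffRatio:TgtCoeffRatio
+    // machinery, assuming SSE-style packets: casting double -> float has
+    // ratio 2:1 (two Packet2d loads feed one pcast that fills a Packet4f),
+    // which is the two-load specialization above; casting float -> double
+    // has ratio 1:2 (one Packet4f holds enough coefficients for two Packet2d
+    // results), which is this specialization -- hence the bounds check
+    // against m_maxIndex before touching a whole source packet.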
+ if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) { + // Force unaligned memory loads since we can't ensure alignment anymore + return internal::pcast(m_impl.template packet(index)); + } else { + const int TgtPacketSize = internal::unpacket_traits::size; + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TgtType; + internal::scalar_cast_op converter; + EIGEN_ALIGN_MAX typename internal::unpacket_traits::type values[TgtPacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < TgtPacketSize; ++i) { + values[i] = converter(m_impl.coeff(index + i)); + } + TgtPacket rslt = internal::pload(values); + return rslt; + } + } + + private: + const TensorEvaluator& m_impl; + const typename TensorEvaluator::Index m_maxIndex; +}; + +template +class TensorConversionOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::nested::type Nested; + typedef Scalar CoeffReturnType; + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr) : m_xpr(xpr) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + +template +struct ConversionSubExprEval { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) { + impl.evalSubExprsIfNeeded(NULL); + return true; + } +}; + +template +struct ConversionSubExprEval { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { return impl.evalSubExprsIfNeeded(data); } +}; + +#ifdef EIGEN_USE_THREADS +template +struct ConversionSubExprEvalAsync { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) { + impl.evalSubExprsIfNeededAsync(nullptr, std::move(done)); + } +}; + +template +struct ConversionSubExprEvalAsync { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) { + impl.evalSubExprsIfNeededAsync(data, std::move(done)); + } +}; +#endif + +namespace internal { + +template +struct CoeffConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator& impl, + Index index) { + internal::scalar_cast_op converter; + return converter(impl.coeff(index)); + } +}; + +template +struct CoeffConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator& impl, + Index index) { + return impl.coeff(index); + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TargetType; + + static constexpr int PacketSize = internal::unpacket_traits::size; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, + Index index) { + internal::scalar_cast_op converter; + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = converter(impl.coeff(index + i)); + } + TargetPacket rslt = internal::pload(values); + return rslt; + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TargetType; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, + 
Index index) { + const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; + const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; + PacketConverter, SrcPacket, TargetPacket, SrcCoeffRatio, TgtCoeffRatio> converter( + impl); + return converter.template packet(index); + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type TargetType; + static constexpr int PacketSize = internal::unpacket_traits::size; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, + Index index) { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index + i); + return internal::pload(values); + } +}; + +template +struct PacketConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, + Index index) { + return impl.template packet(index); + } +}; + +} // namespace internal + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorConversionOp XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef TargetType Scalar; + typedef TargetType CoeffReturnType; + typedef internal::remove_all_t::Scalar> SrcType; + typedef typename PacketType::type PacketReturnType; + typedef typename PacketType::type PacketSourceType; + static constexpr int PacketSize = PacketType::size; + static constexpr bool IsSameType = internal::is_same::value; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = false, + PacketAccess = +#ifndef EIGEN_USE_SYCL + true, +#else + TensorEvaluator::PacketAccess & + internal::type_casting_traits::VectorizedCast, +#endif + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + RawAccess = false + }; + + static constexpr int Layout = TensorEvaluator::Layout; + static constexpr int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + struct TensorConversionOpBlockFactory { + template + struct XprType { + typedef TensorConversionOp type; + }; + + template + typename XprType::type expr(const ArgXprType& expr) const { + return typename XprType::type(expr); + } + }; + + typedef internal::TensorUnaryExprBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + return ConversionSubExprEval, EvaluatorPointerType>::run(m_impl, data); + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) { + ConversionSubExprEvalAsync, EvaluatorPointerType, + EvalSubExprsCallback>::run(m_impl, data, std::move(done)); + } +#endif + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return 
internal::CoeffConv::run(m_impl, index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + // If we are not going to do the cast, we just need to check that base + // TensorEvaluator has packet access. Otherwise we also need to make sure, + // that we have an implementation of vectorized cast. + const bool Vectorizable = IsSameType ? TensorEvaluator::PacketAccess + : int(TensorEvaluator::PacketAccess) & + int(internal::type_casting_traits::VectorizedCast); + + return internal::PacketConv::run(m_impl, + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double cast_cost = TensorOpCost::CastCost(); + if (vectorized) { + const double SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; + const double TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; + return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) + + TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize)); + } else { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return m_impl.getResourceRequirements(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + return TensorBlock(m_impl.block(desc, scratch), TensorConversionOpBlockFactory()); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + /// required by sycl in order to extract the sycl accessor + const TensorEvaluator& impl() const { return m_impl; } + + protected: + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h new file mode 100644 index 00000000000..26984b69090 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -0,0 +1,1123 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. 
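+ * (Editor's illustrative example, not from the upstream header.) The op is
+ * normally built via TensorBase::convolve(); e.g. a valid-mode 2-D
+ * convolution over both dimensions:
+ * \code
+ * Eigen::Tensor<float, 2> input(8, 8);
+ * Eigen::Tensor<float, 2> kernel(3, 3);
+ * input.setRandom();
+ * kernel.setRandom();
+ * Eigen::array<ptrdiff_t, 2> dims = {0, 1};
+ * // Each convolved dimension shrinks to input - kernel + 1: 6 x 6 here.
+ * Eigen::Tensor<float, 2> output = input.convolve(kernel, dims);
+ * \endcode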
+ * + * + */ +namespace internal { + +template +class IndexMapper { + public: + IndexMapper(const InputDims& input_dims, const array& kernel_dims, + const array& indices) { + array dimensions = input_dims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = indices[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + dimensions[index] = result_dim; + } + + array inputStrides; + array outputStrides; + if (static_cast(Layout) == static_cast(ColMajor)) { + inputStrides[0] = 1; + outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1]; + outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1]; + } + } else { + inputStrides[NumDims - 1] = 1; + outputStrides[NumDims - 1] = 1; + for (int i = static_cast(NumDims) - 2; i >= 0; --i) { + inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; + } + } + + array gpuInputDimensions; + array gpuOutputDimensions; + array tmp = dimensions; + array ordering; + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = i + offset; + ordering[index] = indices[i]; + tmp[indices[i]] = -1; + gpuInputDimensions[index] = input_dims[indices[i]]; + gpuOutputDimensions[index] = dimensions[indices[i]]; + } + + int written = static_cast(Layout) == static_cast(ColMajor) ? NumKernelDims : 0; + for (int i = 0; i < NumDims; ++i) { + if (tmp[i] >= 0) { + ordering[written] = i; + gpuInputDimensions[written] = input_dims[i]; + gpuOutputDimensions[written] = dimensions[i]; + ++written; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[ordering[i]]; + m_outputStrides[i] = outputStrides[ordering[i]]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (i > NumKernelDims) { + m_gpuInputStrides[i] = m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1]; + m_gpuOutputStrides[i] = m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1]; + } else { + m_gpuInputStrides[i] = 1; + m_gpuOutputStrides[i] = 1; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (static_cast(i + 1) < offset) { + m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1]; + m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1]; + } else { + m_gpuInputStrides[i] = 1; + m_gpuOutputStrides[i] = 1; + } + } + } + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_gpuInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_gpuInputStrides[d]; + } + if (NumKernelDims < NumDims) { + inputIndex += p * m_inputStrides[NumKernelDims]; + } + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_gpuInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_gpuInputStrides[d]; + } + inputIndex += p * m_inputStrides[limit]; + } + return inputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const { + Index outputIndex = 0; + if 
(static_cast(Layout) == static_cast(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_gpuOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_gpuOutputStrides[d]; + } + if (NumKernelDims < NumDims) { + outputIndex += p * m_outputStrides[NumKernelDims]; + } + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_gpuOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_gpuOutputStrides[d]; + } + outputIndex += p * m_outputStrides[limit]; + } + return outputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_inputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_outputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + k * m_inputStrides[offset + 2]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + k * m_outputStrides[offset + 2]; + } + + private: + static constexpr int NumDims = internal::array_size::value; + array m_inputStrides; + array m_outputStrides; + array m_gpuInputStrides; + array m_gpuOutputStrides; +}; + +template +struct traits > { + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
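+  // (Editor's aside -- illustrative, not part of the upstream patch.) The
+  // IndexMapper above assumes "valid"-mode shapes: each convolved dimension
+  // shrinks to input_dim - kernel_dim + 1, and the strides are the usual
+  // running products. A standalone sketch of the ColMajor branch:
+  //
+  //   std::array<long, 3> input_dims = {8, 8, 4};
+  //   std::array<long, 3> out_dims = input_dims;
+  //   out_dims[0] = input_dims[0] - 3 + 1;  // size-3 kernel on dim 0 -> 6
+  //   std::array<long, 3> in_strides, out_strides;
+  //   in_strides[0] = out_strides[0] = 1;
+  //   for (int i = 1; i < 3; ++i) {
+  //     in_strides[i] = in_strides[i - 1] * input_dims[i - 1];
+  //     out_strides[i] = out_strides[i - 1] * out_dims[i - 1];
+  //   }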
+ typedef typename promote_storage_type::ret Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, typename traits::Index>::type + Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = traits::Layout; + typedef std::conditional_t::val, + typename traits::PointerType, typename traits::PointerType> + PointerType; + + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorConvolutionOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorConvolutionOp type; +}; + +} // end namespace internal + +template +class TensorConvolutionOp + : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, + const Indices& dims) + : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t& inputExpression() + const { + return m_input_xpr; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t& kernelExpression() + const { + return m_kernel_xpr; + } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + +template +struct TensorEvaluator, Device> { + typedef TensorConvolutionOp XprType; + + static constexpr int NumDims = + internal::array_size::Dimensions>::value; + static constexpr int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = + int(TensorEvaluator::IsAligned) & int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), + m_kernelImpl(op.kernelExpression(), device), + m_kernelArg(op.kernelExpression()), + m_kernel(NULL), + m_local_kernel(false), + 
m_device(device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; + } + } + + m_dimensions = m_inputImpl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = NumKernelDims - 1; i >= 0; --i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i < NumKernelDims - 1) { + m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; + } else { + m_kernelStride[NumKernelDims - 1] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_inputImpl.evalSubExprsIfNeeded(NULL); + preloadKernel(); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + void evalTo(typename XprType::Scalar* buffer) { + evalSubExprsIfNeeded(NULL); + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + CoeffReturnType result = CoeffReturnType(0); + convolve(firstInput(index), 0, NumKernelDims - 1, result); + return result; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const { + Index indices[2] = {index, index + PacketSize - 1}; + Index startInputs[2] = {0, 0}; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 
* m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } + startInputs[0] += indices[0]; + startInputs[1] += indices[1]; + + if (startInputs[1] - startInputs[0] == PacketSize - 1) { + PacketReturnType result = internal::pset1(0); + convolvePacket(startInputs[0], 0, NumKernelDims - 1, result); + return result; + } else { + EIGEN_ALIGN_MAX Scalar data[PacketSize]; + data[0] = Scalar(0); + convolve(startInputs[0], 0, NumKernelDims - 1, data[0]); + for (int i = 1; i < PacketSize - 1; ++i) { + data[i] = Scalar(0); + convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]); + } + data[PacketSize - 1] = Scalar(0); + convolve(startInputs[1], 0, NumKernelDims - 1, data[PacketSize - 1]); + return internal::pload(data); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize)); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } + startInput += index; + return startInput; + } + + EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolve(input, kernel, DimIndex - 1, accum); + } else { + accum += m_inputImpl.coeff(input) * m_kernel[kernel]; + } + } + } + + template + EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolvePacket(input, kernel, DimIndex - 1, accum); + } else { + accum = internal::pmadd(m_inputImpl.template packet(input), + internal::pset1(m_kernel[kernel]), accum); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. 
it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool Vectorize = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + array m_inputStride; + array m_outputStride; + + array m_indexStride; + array m_kernelStride; + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device EIGEN_DEVICE_REF m_device; +}; + +// Use an optimized implementation of the evaluation code for GPUs whenever possible. +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) + +template +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int /*kernelSize*/) const { return StaticKernelSize; } +}; +template <> +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int kernelSize) const { return kernelSize; } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D( + InputEvaluator eval, const internal::IndexMapper indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, + float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else + extern __shared__ float s[]; +#endif + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? 
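+  // (Editor's aside -- illustrative, not part of the upstream patch.) Each
+  // block tiles the convolved dimension: it stages last_x - first_x +
+  // kernelSize inputs in shared memory and emits last_x - first_x + 1
+  // outputs, so neighbouring outputs reuse kernelSize - 1 staged values
+  // instead of re-reading global memory.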
first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSize); + const int num_x_output = last_x - first_x + 1; + + const int first_plane = blockIdx.y * blockDim.y; + const int plane_stride = blockDim.y * gridDim.y; + + for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { + // Load inputs to shared memory + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.y * num_x_input; +#pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); + } + + __syncthreads(); + + // Compute the convolution + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); + +#pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + const int kernel_offset = plane_kernel_offset + i; + float result = 0.0f; +#pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { + result += s[k + kernel_offset] * kernel[k]; + } + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x); + buffer[tensor_index] = result; + } + __syncthreads(); + } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D( + InputEvaluator eval, const internal::IndexMapper indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, + const int kernelSizeX, const int kernelSizeY, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else + extern __shared__ float s[]; +#endif + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSizeX); + const int num_x_output = last_x - first_x + 1; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? 
first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + GetKernelSize()(kernelSizeY); + const int num_y_output = last_y - first_y + 1; + + const int first_plane = blockIdx.z * blockDim.z; + const int plane_stride = blockDim.z * gridDim.z; + + for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.z * num_y_input; + +// Load inputs to shared memory +#pragma unroll + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + const int input_offset = num_x_input * (j + plane_kernel_offset); +#pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = + plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y); + s[i + input_offset] = eval.coeff(tensor_index); + } + } + + __syncthreads(); + + // Convolution + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); + +#pragma unroll + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { +#pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; +#pragma unroll + for (int l = 0; l < GetKernelSize()(kernelSizeY); ++l) { + const int kernel_offset = kernelSizeX * l; + const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); +#pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSizeX); ++k) { + result += s[k + input_offset] * kernel[k + kernel_offset]; + } + } + const int tensor_index = + plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y); + buffer[tensor_index] = result; + } + } + + __syncthreads(); + } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D( + InputEvaluator eval, const internal::IndexMapper indexMapper, + const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY, + const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, + const size_t kernelSizeZ, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else + extern __shared__ float s[]; +#endif + + // Load inputs to shared memory + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + kernelSizeX; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + kernelSizeY; + + const int first_z = blockIdx.z * maxZ; + const int last_z = (first_z + maxZ < numZ ? 
first_z + maxZ : numZ) - 1; + const int num_z_input = last_z - first_z + kernelSizeZ; + + for (int p = 0; p < numPlanes; ++p) { + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = 0; + + for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( + i + first_x, j + first_y, k + first_z); + s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); + } + } + } + + __syncthreads(); + + // Convolution + const int num_z_output = last_z - first_z + 1; + const int num_y_output = last_y - first_y + 1; + const int num_x_output = last_x - first_x + 1; + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); + + for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + for (int n = 0; n < kernelSizeZ; ++n) { + for (int m = 0; m < kernelSizeY; ++m) { + for (int l = 0; l < kernelSizeX; ++l) { + result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * + kernel[l + kernelSizeX * (m + kernelSizeY * n)]; + } + } + } + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset( + i + first_x, j + first_y, k + first_z); + buffer[tensor_index] = result; + } + } + } + __syncthreads(); + } +}; + +template +struct TensorEvaluator, GpuDevice> { + typedef TensorConvolutionOp XprType; + + static constexpr int NumDims = + internal::array_size::Dimensions>::value; + static constexpr int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = + TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), + m_kernelImpl(op.kernelExpression(), device), + m_kernelArg(op.kernelExpression()), + m_indices(op.indices()), + m_buf(NULL), + m_kernel(NULL), + m_local_kernel(false), + m_device(device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + typedef typename XprType::CoeffReturnType 
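+  // (Editor's aside -- illustrative, not part of the upstream patch.) Note
+  // the evalSubExprsIfNeeded() contract used below: given a destination
+  // buffer it evaluates directly into it and returns false (nothing to read
+  // back); given NULL it allocates m_buf, evaluates into that, and returns
+  // true so coeff()/packet() can serve reads from the staging buffer.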
CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static constexpr int PacketSize = internal::unpacket_traits::size; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + static unsigned int ceil(unsigned int num, unsigned int denom) { + const unsigned int rounded_toward_zero = num / denom; + if (num > rounded_toward_zero * denom) { + return rounded_toward_zero + 1; + } + return rounded_toward_zero; + } + + void executeEval(Scalar* data) const { + typedef typename TensorEvaluator::Dimensions InputDims; + + const int maxSharedMem = m_device.sharedMemPerBlock(); + const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock(); + const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = m_device.getNumGpuMultiProcessors(); + const int warpSize = 32; + + switch (NumKernelDims) { + case 1: { + const int kernel_size = m_kernelImpl.dimensions().TotalSize(); + + const int numX = dimensions()[m_indices[0]]; + const int numP = dimensions().TotalSize() / numX; + int maxX; + dim3 block_size; + + const int single_stride_dim = + static_cast(Layout) == static_cast(ColMajor) ? 
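+  // (Editor's aside -- illustrative, not part of the upstream patch.) The
+  // ceil() helper above is plain round-up division -- ceil(10, 4) == 3,
+  // ceil(8, 4) == 2 -- used so that num_blocks * tile covers the extent.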
0 : m_inputImpl.dimensions().rank() - 1; + if (m_indices[0] == single_stride_dim) { + // Maximum the reuse + const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; + maxX = numext::mini(inner_dim, numX); + const int maxP = numext::mini(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); + block_size.x = numext::mini(maxThreadsPerBlock, maxX); + block_size.y = numext::mini(maxThreadsPerBlock / block_size.x, maxP); + } else { + // Read as much as possible alongside the inner most dimension, that is the plane + const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); + const int maxP = numext::mini(inner_dim, numP); + maxX = numext::mini(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); + + block_size.x = numext::mini(warpSize, maxX); + block_size.y = numext::mini(maxThreadsPerBlock / block_size.x, maxP); + } + + const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); + gpu_assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); + + dim3 num_blocks(num_x_blocks, numext::mini(num_y_blocks, ceil(numP, block_size.y))); + + // cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " + // num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " + // << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices{m_indices[0]}; + const array kernel_dims{m_kernelImpl.dimensions()[0]}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size) { + case 4: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 4>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, + numX, maxX, 4, data); + break; + } + case 7: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, + numX, maxX, 7, data); + break; + } + default: { + LAUNCH_GPU_KERNEL( + (EigenConvolutionKernel1D, Index, InputDims, Dynamic>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, + kernel_size, data); + } + } + break; + } + + case 2: { + const int idxX = static_cast(Layout) == static_cast(ColMajor) ? 0 : 1; + const int idxY = static_cast(Layout) == static_cast(ColMajor) ? 
1 : 0; + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numP = dimensions().TotalSize() / (numX * numY); + + const float scaling_factor = + sqrtf(static_cast(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); + + // Snap maxX to warp size + int inner_dim = ((static_cast(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; + const int maxX = numext::mini(inner_dim, numX); + const int maxY = + numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); + const int maxP = numext::mini( + maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); + + dim3 block_size; + block_size.x = numext::mini(1024, maxX); + block_size.y = numext::mini(1024 / block_size.x, maxY); + block_size.z = numext::mini(1024 / (block_size.x * block_size.y), maxP); + + const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); + gpu_assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int num_y_blocks = ceil(numY, maxY); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); + + dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini(num_z_blocks, ceil(numP, block_size.z))); + + // cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " + // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << + // " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " + // shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices{m_indices[idxX], m_indices[idxY]}; + const array kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size_x) { + case 4: { + switch (kernel_size_y) { + case 7: { + LAUNCH_GPU_KERNEL( + (EigenConvolutionKernel2D, Index, InputDims, 4, 7>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, + numY, maxY, 4, 7, data); + break; + } + default: { + LAUNCH_GPU_KERNEL( + (EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, + numY, maxY, 4, kernel_size_y, data); + break; + } + } + break; + } + case 7: { + switch (kernel_size_y) { + case 4: { + LAUNCH_GPU_KERNEL( + (EigenConvolutionKernel2D, Index, InputDims, 7, 4>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, + numY, maxY, 7, 4, data); + break; + } + default: { + LAUNCH_GPU_KERNEL( + (EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, + numY, maxY, 7, kernel_size_y, data); + break; + } + } + break; + } + default: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, + Dynamic, Dynamic>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, + numX, maxX, 
numY, maxY, kernel_size_x, kernel_size_y, data); + break; + } + } + break; + } + + case 3: { + const int idxX = static_cast(Layout) == static_cast(ColMajor) ? 0 : 2; + const int idxY = static_cast(Layout) == static_cast(ColMajor) ? 1 : 1; + const int idxZ = static_cast(Layout) == static_cast(ColMajor) ? 2 : 0; + + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numZ = dimensions()[m_indices[idxZ]]; + const int numP = dimensions().TotalSize() / (numX * numY * numZ); + + const int maxX = numext::mini( + 128, numext::mini(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, + numX)); + const int maxY = numext::mini( + 128, numext::mini( + maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, + numY)); + const int maxZ = numext::mini( + 128, numext::mini( + maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - + kernel_size_z + 1, + numZ)); + + dim3 block_size; + block_size.x = numext::mini(32, maxX); + block_size.y = numext::mini(32, maxY); + block_size.z = numext::mini(1024 / (block_size.x * block_size.y), maxZ); + dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); + + const int shared_mem = + (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); + gpu_assert(shared_mem <= maxSharedMem); + + // cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " + // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << + // " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << + // endl; + const array indices{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}; + const array kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY], + m_kernelImpl.dimensions()[idxZ]}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, + maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. 
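+    // (Editor's aside -- illustrative, not part of the upstream patch.) For
+    // a 3x3 kernel this charges kernel_size = 9 multiply/add pairs per output
+    // coefficient, i.e. about 9 * (AddCost + MulCost) of compute on top of
+    // the per-output index arithmetic accounted in firstIndex_compute_cost.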
+ const double convolve_compute_cost = TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize)); + } + + private: + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + KernelArgType m_kernelArg; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + + const GpuDevice& m_device; +}; +#endif + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h new file mode 100644 index 00000000000..88ae9f4bb5b --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -0,0 +1,546 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Copyright (C) 2016 Benoit Steiner + +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. 
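+ * (Editor's illustrative note, not from the upstream header.) This file
+ * supplies the SyclDevice specializations of the convolution evaluator; the
+ * public entry point is unchanged. Assuming a SYCL queue wrapper is set up
+ * (the selector spelling varies with the SYCL version in use):
+ * \code
+ * Eigen::QueueInterface queue_interface{cl::sycl::default_selector{}};
+ * Eigen::SyclDevice sycl_device(&queue_interface);
+ * output.device(sycl_device) = input.convolve(kernel, dims);
+ * \endcode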
+ *
+ *
+ */
+
+enum class convolution_type { CONV1D, CONV2D, CONV3D };
+template
+struct EigenConvolutionKernel;
+template
+struct EigenConvolutionKernel {
+  typedef cl::sycl::accessor
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper indexMapper;
+  const size_t kernelSize;
+  const cl::sycl::range<2> input_range;
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper indexMapper_,
+                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernelSize(kernelSize_),
+        input_range(input_range_) {}
+
+  template
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const {
+    return (boolean_check[0] && boolean_check[1]);
+  }
+  void operator()(cl::sycl::nd_item<2> itemID) const {
+    auto buffer_ptr = buffer_acc;
+    auto kernel_ptr = kernel_filter;
+    // the rows required for each plane in shared memory
+    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
+    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
+    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
+    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
+    /// fill the shared memory
+    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
+      const size_t local_index = i + plane_kernel_offset;
+      const size_t tensor_index =
+          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
+
+      local_acc[local_index] =
+          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
+              ? device_evaluator.coeff(tensor_index)
+              : CoeffReturnType(0);
+    }
+
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+    // calculate the convolution // output start x
+    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
+    if (boundary_check(itemID.get_global_id() < input_range)) {
+      CoeffReturnType result = static_cast(0);
+      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
+      for (size_t k = 0; k < kernelSize; ++k) {
+        result += (local_acc[k + index] * kernel_ptr[k]);
+      }
+      const size_t tensor_index =
+          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
+          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
+      buffer_ptr[tensor_index] = result;
+    }
+  }
+};
+
+template
+struct EigenConvolutionKernel {
+  typedef cl::sycl::accessor
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper indexMapper;
+  const cl::sycl::range<2> kernel_size;
+  const cl::sycl::range<3> input_range;
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper indexMapper_,
+                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernel_size(kernel_size_),
+        input_range(input_range_) {}
+  template
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
+    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
+  }
+
+  void operator()(cl::sycl::nd_item<3> itemID) const {
+    auto buffer_ptr = buffer_acc;
+    auto kernel_ptr = kernel_filter;
+    // the rows required for each plane in shared memory
+    const auto num_input = cl::sycl::range<2>{
+        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
+
+    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
+    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
+
+    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
+                                                 itemID.get_group(1) * itemID.get_local_range()[1]};
+
+    // fill the local memory
+    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
+    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
+      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
+      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
+      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
+        const size_t local_index = i + local_input_offset;
+        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+                                                             i + input_offset[0], j + input_offset[1]);
+        local_acc[local_index] =
+            (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) && in_range_dim1 && in_range_dim2)
+                ?
device_evaluator.coeff(tensor_index) + : CoeffReturnType(0); + } + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // output offset start for each thread + const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], + itemID.get_group(1) * itemID.get_local_range()[1]}; + + if (boundary_check(itemID.get_global_id() < input_range)) { + CoeffReturnType result = static_cast(0); + + for (size_t j = 0; j < kernel_size[1]; j++) { + size_t kernel_offset = kernel_size[0] * j; + const size_t index = + (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0); + for (size_t i = 0; i < kernel_size[0]; i++) { + result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]); + } + } + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0], + itemID.get_local_id(1) + output_offset[1]); + + buffer_ptr[tensor_index] = result; + } + } +}; + +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const cl::sycl::range<3> kernel_size; + const cl::sycl::range<3> input_range; + const size_t numP; + + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_, + const size_t numP_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernel_size(kernel_size_), + input_range(input_range_), + numP(numP_) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const { + return (boolean_check[0] && boolean_check[1] && boolean_check[2]); + } + void operator()(cl::sycl::nd_item<3> itemID) const { + auto buffer_ptr = buffer_acc; + auto kernel_ptr = kernel_filter; + const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1}; + + const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()}; + + const auto output_offset = + cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()}; + + for (size_t p = 0; p < numP; p++) { + /// fill the shared memory + const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) { + size_t local_index_dim2 = num_input[0] * num_input[1] * k; + bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1)); + for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) { + bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1)); + size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2; + for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) { + bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1)); + const size_t local_index = local_index_dim1 + i; + const size_t tensor_index = + plane_input_offset + 
indexMapper.mapGpuInputKernelToTensorInputOffset( + i + input_offset[0], j + input_offset[1], k + input_offset[2]); + local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0); + } + } + } + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution + + if (boundary_check(itemID.get_global_id() < input_range)) { + CoeffReturnType result = static_cast(0); + for (size_t k = 0; k < kernel_size[2]; k++) { + for (size_t j = 0; j < kernel_size[1]; j++) { + for (size_t i = 0; i < kernel_size[0]; i++) { + const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k); + const size_t local_index = + ((i + itemID.get_local_id(0)) + + num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2)))); + + result += (local_acc[local_index] * kernel_ptr[kernel_index]); + } + } + } + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]); + buffer_ptr[tensor_index] = result; + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + } +}; + +template +struct TensorEvaluator, Eigen::SyclDevice> { + typedef TensorConvolutionOp XprType; + + static constexpr int NumDims = + internal::array_size::Dimensions>::value; + static constexpr int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; + typedef const Eigen::SyclDevice Device; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory KernelStorage; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device) + : m_inputImpl(op.inputExpression(), device), + m_kernelArg(op.kernelExpression()), + m_kernelImpl(op.kernelExpression(), device), + m_indices(op.indices()), + m_buf(NULL), + m_kernel(NULL), + m_local_kernel(false), + m_device(device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions &input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions &kernel_dims = + m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool 
evalSubExprsIfNeeded(EvaluatorPointerType data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (EvaluatorPointerType)m_device.get( + (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate_temp(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate_temp(m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; } + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + typename KernelStorage::Type in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz)); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(m_device.get(local), m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + m_kernel = local; + m_local_kernel = true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const { + typedef TensorEvaluator InputEvaluator; + typedef typename InputEvaluator::Dimensions InputDims; + switch (NumKernelDims) { + case 1: { + const size_t numX = dimensions()[m_indices[0]]; + const size_t numP = dimensions().TotalSize() / numX; + const auto input_dim = std::array{numX, numP}; + auto global_range = cl::sycl::range<2>{1, 1}; + auto local_range = cl::sycl::range<2>{1, 1}; + const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); + + m_device.parallel_for_setup(input_dim, global_range, local_range); + const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]); + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + const array indices{{m_indices[0]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[0]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + typedef EigenConvolutionKernel + ConvKernel; + + m_device + .template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])) + .wait(); + break; + } + + case 2: { + auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 1, + static_cast(Layout) == static_cast(ColMajor) ? 
1 : 0}; + auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], + (size_t)m_kernelImpl.dimensions()[kernel_index[1]]}; + const size_t numX = dimensions()[m_indices[kernel_index[0]]]; + const size_t numY = dimensions()[m_indices[kernel_index[1]]]; + const size_t numP = dimensions().TotalSize() / (numX * numY); + auto input_dim = std::array{numX, numY, numP}; + + auto global_range = cl::sycl::range<3>{1, 1, 1}; + auto local_range = cl::sycl::range<3>{1, 1, 1}; + + m_device.parallel_for_setup(input_dim, global_range, local_range); + + const size_t local_memory_size = + (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2]; + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + const array indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}}; + const array kernel_dims{ + {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + typedef EigenConvolutionKernel + ConvKernel; + m_device + .template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}) + .wait(); + break; + } + + case 3: { + auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 2, + static_cast(Layout) == static_cast(ColMajor) ? 1 : 1, + static_cast(Layout) == static_cast(ColMajor) ? 2 : 0}; + + auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], + (size_t)m_kernelImpl.dimensions()[kernel_index[1]], + (size_t)m_kernelImpl.dimensions()[kernel_index[2]]}; + + const size_t numX = dimensions()[m_indices[kernel_index[0]]]; + const size_t numY = dimensions()[m_indices[kernel_index[1]]]; + const size_t numZ = dimensions()[m_indices[kernel_index[2]]]; + auto input_dim = std::array{numX, numY, numZ}; + const size_t numP = dimensions().TotalSize() / (numX * numY * numZ); + + const array indices{ + {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]], + m_kernelImpl.dimensions()[kernel_index[1]], + m_kernelImpl.dimensions()[kernel_index[2]]}}; + + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + auto global_range = cl::sycl::range<3>{1, 1, 1}; + auto local_range = cl::sycl::range<3>{1, 1, 1}; + + m_device.parallel_for_setup(input_dim, global_range, local_range); + auto local_memory_range = (local_range + kernel_size - 1); + const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2]; + + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + typedef EigenConvolutionKernel + ConvKernel; + m_device + .template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP) + .wait(); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_buf != NULL); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + 
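+  // The kernel launches in executeEval() above tile the output across
+  // work-groups and stage the input window each tile needs in shared local
+  // memory: a work-group of extent L convolved with a kernel of extent K
+  // must load L + K - 1 samples per dimension (the tile plus its halo). As
+  // an illustrative check (not upstream code), for the 2D case with a
+  // 16x16 work-group and a 5x5 kernel:
+  //
+  //   (16 + 5 - 1) * (16 + 5 - 1) = 400 staged elements per plane,
+  //
+  // which is what the (local_range + kernel_size - 1) products feeding the
+  // sharedMemPerBlock() assertions compute.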
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
+    eigen_assert(m_buf != NULL);
+    eigen_assert(index < m_dimensions.TotalSize());
+    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
+    // model.
+    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+    // We ignore the use of fused multiply-add.
+    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+    const double firstIndex_compute_cost =
+        NumDims *
+        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
+    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
+                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
+  }
+
+ private:
+  // No assignment (copies are needed by the kernels)
+  TensorEvaluator& operator=(const TensorEvaluator&);
+  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
+  KernelArgType m_kernelArg;
+  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
+  Indices m_indices;
+  Dimensions m_dimensions;
+  EvaluatorPointerType m_buf;
+  typename KernelStorage::Type m_kernel;
+  bool m_local_kernel;
+  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
new file mode 100644
index 00000000000..c0f13337a60
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -0,0 +1,190 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class TensorOpCost
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief A cost model used to limit the number of threads used for evaluating
+ * tensor expressions.
+ *
+ */
+
+// Class storing the cost of evaluating a tensor expression in terms of the
+// estimated number of operand bytes loaded, bytes stored, and compute cycles.
+class TensorOpCost {
+ public:
+  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
+  // model based on minimal reciprocal throughput numbers from Intel or
+  // Agner Fog's tables would be better than what is there now.
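+
+  // As a worked sketch of how the helpers below compose (the numbers are
+  // illustrative, not a measured cost): evaluating 'a + b' over floats
+  // costs AddCost<float>() compute cycles per coefficient, 2 * sizeof(float)
+  // bytes loaded and sizeof(float) bytes stored, i.e.
+  //
+  //   TensorOpCost cost(2 * sizeof(float), sizeof(float),
+  //                     TensorOpCost::AddCost<float>());
+  //
+  // The (loads, stores, cycles, vectorized, packet_size) constructor below
+  // divides the cycle count by the packet size when vectorized, since one
+  // packet operation produces packet_size coefficients.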
+  template <typename ArgType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
+    return internal::functor_traits<internal::scalar_product_op<ArgType, ArgType> >::Cost;
+  }
+  template <typename ArgType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
+    return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
+  }
+  template <typename ArgType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
+    return internal::functor_traits<internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
+  }
+  template <typename ArgType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
+    return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
+  }
+  template <typename SrcType, typename TargetType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
+    return internal::functor_traits<internal::scalar_cast_op<SrcType, TargetType> >::Cost;
+  }
+
+  EIGEN_DEVICE_FUNC TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
+  EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
+      : bytes_loaded_(bytes_loaded), bytes_stored_(bytes_stored), compute_cycles_(compute_cycles) {}
+
+  EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized,
+                                 double packet_size)
+      : bytes_loaded_(bytes_loaded),
+        bytes_stored_(bytes_stored),
+        compute_cycles_(vectorized ? compute_cycles / packet_size : compute_cycles) {
+    eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
+    eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
+    eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { return bytes_loaded_; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { return bytes_stored_; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { return compute_cycles_; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(double load_cost, double store_cost,
+                                                          double compute_cost) const {
+    return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + compute_cost * compute_cycles_;
+  }
+
+  // Drop memory access component. Intended for cases when memory accesses are
+  // sequential or are completely masked by computations.
+  EIGEN_DEVICE_FUNC void dropMemoryCost() {
+    bytes_loaded_ = 0;
+    bytes_stored_ = 0;
+  }
+
+  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
+  }
+
+  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(const TensorOpCost& rhs) {
+    bytes_loaded_ += rhs.bytes_loaded();
+    bytes_stored_ += rhs.bytes_stored();
+    compute_cycles_ += rhs.compute_cycles();
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
+    bytes_loaded_ *= rhs;
+    bytes_stored_ *= rhs;
+    compute_cycles_ *= rhs;
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(TensorOpCost lhs, const TensorOpCost& rhs) {
+    lhs += rhs;
+    return lhs;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(TensorOpCost lhs, double rhs) {
+    lhs *= rhs;
+    return lhs;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(double lhs, TensorOpCost rhs) {
+    rhs *= lhs;
+    return rhs;
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
+    return os << "[bytes_loaded = " << tc.bytes_loaded() << ", bytes_stored = " << tc.bytes_stored()
+              << ", compute_cycles = " << tc.compute_cycles() << "]";
+  }
+
+ private:
+  double bytes_loaded_;
+  double bytes_stored_;
+  double compute_cycles_;
+};
+
+// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads
+// in [1:max_threads] instead of just switching multi-threading off for small
+// work units.
+template <typename Device>
+class TensorCostModel {
+ public:
+  // Scaling from Eigen compute cost to device cycles.
+  static const int kDeviceCyclesPerComputeCycle = 1;
+
+  // Costs in device cycles.
+  static const int kStartupCycles = 100000;
+  static const int kPerThreadCycles = 100000;
+  static const int kTaskSize = 40000;
+
+  // Returns the number of threads in [1:max_threads] to use for
+  // evaluating an expression with the given output size and cost per
+  // coefficient.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(double output_size, const TensorOpCost& cost_per_coeff,
+                                                              int max_threads) {
+    double cost = totalCost(output_size, cost_per_coeff);
+    double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+    // Make sure we don't invoke undefined behavior when we convert to an int.
+    threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
+    return numext::mini(max_threads, numext::maxi(1, static_cast<int>(threads)));
+  }
+
+  // taskSize assesses parallel task size.
+  // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
+  // granularity needs to be increased to mitigate parallelization overheads.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(double output_size,
+                                                               const TensorOpCost& cost_per_coeff) {
+    return totalCost(output_size, cost_per_coeff) / kTaskSize;
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(double output_size,
+                                                                const TensorOpCost& cost_per_coeff) {
+    // Cost of memory fetches from L2 cache. 64 is typical cache line size.
+    // 11 is L2 cache latency on Haswell.
+    // We don't know whether data is in L1, L2 or L3.
+    // But we are most interested
+    // in single-threaded computational time around 100us-10ms (smaller time
+    // is too small for parallelization, larger time is not interesting
+    // either because we are probably using all available threads already).
+    // And for the target time range, L2 seems to be what matters. Data set
+    // fitting into L1 is too small to take noticeable time. Data set fitting
+    // only into L3 presumably will take more than 10ms to load and process.
+    const double kLoadCycles = 1.0 / 64 * 11;
+    const double kStoreCycles = 1.0 / 64 * 11;
+    // Scaling from Eigen compute cost to device cycles.
+    return output_size * cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, kDeviceCyclesPerComputeCycle);
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
new file mode 100644
index 00000000000..c8e1df73394
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -0,0 +1,309 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class TensorCustomUnaryOp
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor custom class.
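+ *
+ * The functor supplied to a TensorCustomUnaryOp must expose the
+ * dimensions() and eval() members used by the evaluator below. A minimal
+ * sketch (MyUnaryFunc is a hypothetical name, not part of Eigen):
+ *
+ *   struct MyUnaryFunc {
+ *     template <typename Input>
+ *     auto dimensions(const Input& input) const { return input.dimensions(); }
+ *     template <typename Input, typename Output, typename Device>
+ *     void eval(const Input& input, Output& output, const Device& device) const {
+ *       output.device(device) = input.abs();  // any custom evaluation
+ *     }
+ *   };
+ *
+ * applied via input_tensor.customOp(MyUnaryFunc()).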
+ * + * + */ +namespace internal { +template +struct traits > { + typedef typename XprType::Scalar Scalar; + typedef typename XprType::StorageKind StorageKind; + typedef typename XprType::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = traits::Layout; + typedef typename traits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorCustomUnaryOp EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef TensorCustomUnaryOp type; +}; + +} // end namespace internal + +template +class TensorCustomUnaryOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func) + : m_expr(expr), m_func(func) {} + + EIGEN_DEVICE_FUNC const CustomUnaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_expr; } + + protected: + typename XprType::Nested m_expr; + const CustomUnaryFunc m_func; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorCustomUnaryOp ArgType; + typedef typename internal::traits::Index Index; + static constexpr int NumDims = internal::traits::NumDimensions; + typedef DSizes Dimensions; + typedef std::remove_const_t Scalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) { + m_dimensions = op.func().dimensions(op.expression()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast( + m_device.get((CoeffReturnType*)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)))); + evalTo(m_result); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + if (m_result) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_result[index]; } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + + protected: + void evalTo(EvaluatorPointerType data) { + TensorMap > result(m_device.get(data), m_dimensions); + m_op.func().eval(m_op.expression(), result, m_device); + } + + Dimensions m_dimensions; + const ArgType m_op; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_result; +}; + +/** \class TensorCustomBinaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor custom class. + * + * + */ +namespace internal { +template +struct traits > { + typedef typename internal::promote_storage_type::ret Scalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef + typename promote_index_type::Index, typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = traits::Layout; + typedef std::conditional_t::val, + typename traits::PointerType, typename traits::PointerType> + PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorCustomBinaryOp& type; +}; + +template +struct nested > { + typedef TensorCustomBinaryOp type; +}; + +} // end namespace internal + +template +class TensorCustomBinaryOp + : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::traits::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, + const CustomBinaryFunc& func) + + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {} + + EIGEN_DEVICE_FUNC const CustomBinaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& lhsExpression() const { + return m_lhs_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& rhsExpression() const { + return m_rhs_xpr; + } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const CustomBinaryFunc m_func; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorCustomBinaryOp XprType; + typedef typename internal::traits::Index Index; + static constexpr int NumDims = internal::traits::NumDimensions; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess 
= false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) { + m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast( + m_device.get((CoeffReturnType*)m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType)))); + evalTo(m_result); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + if (m_result != NULL) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_result[index]; } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + + protected: + void evalTo(EvaluatorPointerType data) { + TensorMap > result(m_device.get(data), m_dimensions); + m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); + } + + Dimensions m_dimensions; + const XprType m_op; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_result; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h new file mode 100644 index 00000000000..9898fdcfbb5 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -0,0 +1,139 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its argument + * on the specified computing 'device' (GPU, thread pool, ...) + * + * Example: + * C.device(EIGEN_GPU) = A + B; + * + * Todo: operator *= and /=. 
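+ *
+ * For instance, with EIGEN_USE_THREADS defined (a sketch; the pool and
+ * tensor sizes are arbitrary):
+ *
+ *   Eigen::ThreadPool pool(4);
+ *   Eigen::ThreadPoolDevice my_device(&pool, 4);
+ *   Eigen::Tensor<float, 2> A(100, 100), B(100, 100), C(100, 100);
+ *   A.setRandom(); B.setRandom();
+ *   C.device(my_device) = A + B;   // evaluated on the thread pool
+ *   C.device(my_device) += A;      // operator+= defined below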
+ */ + +template +class TensorDevice { + public: + TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorDevice) + + template + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + template + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + template + EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> + Difference; + Difference difference(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, difference); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; +}; + +/** \class TensorAsyncDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its + * argument asynchronously on the specified device. Currently only + * ThreadPoolDevice implements proper asynchronous execution, while the default + * and GPU devices just run the expression synchronously and call m_done() on + * completion.. + * + * Example: + * auto done = []() { ... expression evaluation done ... }; + * C.device(thread_pool_device, std::move(done)) = A + B; + */ + +template +class TensorAsyncDevice { + public: + TensorAsyncDevice(const DeviceType& device, ExpressionType& expression, DoneCallback done) + : m_device(device), m_expression(expression), m_done(std::move(done)) {} + + template + EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + typedef internal::TensorExecutor Executor; + + Assign assign(m_expression, other); + Executor::run(assign, m_device); + m_done(); + + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; + DoneCallback m_done; +}; + +#ifdef EIGEN_USE_THREADS +template +class TensorAsyncDevice { + public: + TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression, DoneCallback done) + : m_device(device), m_expression(expression), m_done(std::move(done)) {} + + template + EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + typedef internal::TensorAsyncExecutor Executor; + + // WARNING: After assignment 'm_done' callback will be in undefined state. 
+ Assign assign(m_expression, other); + Executor::runAsync(assign, m_device, std::move(m_done)); + + return *this; + } + + protected: + const ThreadPoolDevice& m_device; + ExpressionType& m_expression; + DoneCallback m_done; +}; +#endif + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h new file mode 100644 index 00000000000..c2c8ed002bf --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -0,0 +1,7 @@ + +#if defined(__clang__) || defined(__GNUC__) +#warning \ + "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file" +#endif + +#include "TensorDeviceGpu.h" diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h new file mode 100644 index 00000000000..eaaf3321500 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -0,0 +1,113 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +// Default device for the machine (typically a single cpu core) +struct DefaultDevice { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { internal::aligned_free(buffer); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { return allocate(num_bytes); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { deallocate(buffer); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const { +#ifdef EIGEN_GPU_COMPILE_PHASE + // std::fill is not a device function, so resort to simple loop. 
+ for (T* it = begin; it != end; ++it) { + *it = value; + } +#else + std::fill(begin, end, value); +#endif + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { +#if !defined(EIGEN_GPU_COMPILE_PHASE) + // Running on the host CPU + return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 64; +#else + // Running on a CUDA device + return 32; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) + // Running on the host CPU + return l1CacheSize(); +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 48 * 1024; // FIXME : update this number for HIP +#else + // Running on a CUDA device, return the amount of shared memory available. + return 48 * 1024; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) + // Running single threaded on the host CPU + return l3CacheSize(); +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return firstLevelCacheSize(); // FIXME : update this number for HIP +#else + // Running on a CUDA device + return firstLevelCacheSize(); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { + // Nothing. Default device operations are synchronous. + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { +#if !defined(EIGEN_GPU_COMPILE_PHASE) + // Running single threaded on the host CPU + // Should return an enum that encodes the ISA supported by the CPU + return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + // return 1 as major for HIP + return 1; +#else + // Running on a CUDA device + return EIGEN_CUDA_ARCH / 100; +#endif + } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h new file mode 100644 index 00000000000..4c24bc1f0a9 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h @@ -0,0 +1,395 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H + +// This header file container defines fo gpu* macros which will resolve to +// their equivalent hip* or cuda* versions depending on the compiler in use +// A separate header (included at the end of this file) will undefine all +#include "TensorGpuHipCudaDefines.h" + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +static const int kGpuScratchSize = 1024; + +// This defines an interface that GPUDevice can take to use +// HIP / CUDA streams underneath. 
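+// In typical use the GpuStreamDevice defined further down implements this
+// interface, e.g. (a sketch; requires EIGEN_USE_GPU and a CUDA/HIP
+// compiler):
+//
+//   Eigen::GpuStreamDevice stream;   // default stream on the current device
+//   Eigen::GpuDevice gpu_device(&stream);
+//   C.device(gpu_device) = A + B;    // expression evaluated on the stream
+//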
+class StreamInterface { + public: + virtual ~StreamInterface() {} + + virtual const gpuStream_t& stream() const = 0; + virtual const gpuDeviceProp_t& deviceProperties() const = 0; + + // Allocate memory on the actual device where the computation will run + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; + + // Return a scratchpad buffer of size 1k + virtual void* scratchpad() const = 0; + + // Return a semaphore. The semaphore is initially initialized to 0, and + // each kernel using it is responsible for resetting to 0 upon completion + // to maintain the invariant that the semaphore is always equal to 0 upon + // each kernel start. + virtual unsigned int* semaphore() const = 0; +}; + +class GpuDeviceProperties { + public: + GpuDeviceProperties() : initialized_(false), first_(true), device_properties_(nullptr) {} + + ~GpuDeviceProperties() { + if (device_properties_) { + delete[] device_properties_; + } + } + + EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { return device_properties_[device]; } + + EIGEN_STRONG_INLINE bool isInitialized() const { return initialized_; } + + void initialize() { + if (!initialized_) { + // Attempts to ensure proper behavior in the case of multiple threads + // calling this function simultaneously. This would be trivial to + // implement if we could use std::mutex, but unfortunately mutex don't + // compile with nvcc, so we resort to atomics and thread fences instead. + // Note that if the caller uses a compiler that doesn't support c++11 we + // can't ensure that the initialization is thread safe. + if (first_.exchange(false)) { + // We're the first thread to reach this point. + int num_devices; + gpuError_t status = gpuGetDeviceCount(&num_devices); + if (status != gpuSuccess) { + std::cerr << "Failed to get the number of GPU devices: " << gpuGetErrorString(status) << std::endl; + gpu_assert(status == gpuSuccess); + } + device_properties_ = new gpuDeviceProp_t[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = gpuGetDeviceProperties(&device_properties_[i], i); + if (status != gpuSuccess) { + std::cerr << "Failed to initialize GPU device #" << i << ": " << gpuGetErrorString(status) << std::endl; + gpu_assert(status == gpuSuccess); + } + } + + std::atomic_thread_fence(std::memory_order_release); + initialized_ = true; + } else { + // Wait for the other thread to inititialize the properties. 
+ while (!initialized_) { + std::atomic_thread_fence(std::memory_order_acquire); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + } + } + } + + private: + volatile bool initialized_; + std::atomic first_; + gpuDeviceProp_t* device_properties_; +}; + +EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() { + static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties(); + if (!deviceProperties->isInitialized()) { + deviceProperties->initialize(); + } + return *deviceProperties; +} + +EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) { + return GetGpuDeviceProperties().get(device); +} + +static const gpuStream_t default_stream = gpuStreamDefault; + +class GpuStreamDevice : public StreamInterface { + public: + // Use the default stream on the current device + GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { + gpuError_t status = gpuGetDevice(&device_); + if (status != gpuSuccess) { + std::cerr << "Failed to get the GPU devices " << gpuGetErrorString(status) << std::endl; + gpu_assert(status == gpuSuccess); + } + } + // Use the default stream on the specified device + GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {} + // Use the specified stream. Note that it's the + // caller responsibility to ensure that the stream can run on + // the specified device. If no device is specified the code + // assumes that the stream is associated to the current gpu device. + GpuStreamDevice(const gpuStream_t* stream, int device = -1) + : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { + if (device < 0) { + gpuError_t status = gpuGetDevice(&device_); + if (status != gpuSuccess) { + std::cerr << "Failed to get the GPU devices " << gpuGetErrorString(status) << std::endl; + gpu_assert(status == gpuSuccess); + } + } else { + int num_devices; + gpuError_t err = gpuGetDeviceCount(&num_devices); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + gpu_assert(device < num_devices); + device_ = device; + } + } + + virtual ~GpuStreamDevice() { + if (scratch_) { + deallocate(scratch_); + } + } + + const gpuStream_t& stream() const { return *stream_; } + const gpuDeviceProp_t& deviceProperties() const { return GetGpuDeviceProperties(device_); } + virtual void* allocate(size_t num_bytes) const { + gpuError_t err = gpuSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + void* result; + err = gpuMalloc(&result, num_bytes); + gpu_assert(err == gpuSuccess); + gpu_assert(result != NULL); + return result; + } + virtual void deallocate(void* buffer) const { + gpuError_t err = gpuSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + gpu_assert(buffer != NULL); + err = gpuFree(buffer); + gpu_assert(err == gpuSuccess); + } + + virtual void* scratchpad() const { + if (scratch_ == NULL) { + scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + virtual unsigned int* semaphore() const { + if (semaphore_ == NULL) { + char* scratch = static_cast(scratchpad()) + kGpuScratchSize; + semaphore_ = reinterpret_cast(scratch); + gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + return semaphore_; + } + + private: + const gpuStream_t* stream_; + int device_; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + +struct GpuDevice { + // The 
StreamInterface is not owned: the caller is + // responsible for its initialization and eventual destruction. + explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { eigen_assert(stream); } + explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { + eigen_assert(stream); + } + // TODO(bsteiner): This is an internal API, we should not expose it. + EIGEN_STRONG_INLINE const gpuStream_t& stream() const { return stream_->stream(); } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return stream_->allocate(num_bytes); } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { stream_->deallocate(buffer); } + + EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { return stream_->allocate(num_bytes); } + + EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { stream_->deallocate(buffer); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } + + EIGEN_STRONG_INLINE void* scratchpad() const { return stream_->scratchpad(); } + + EIGEN_STRONG_INLINE unsigned int* semaphore() const { return stream_->semaphore(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); +#else + EIGEN_UNUSED_VARIABLE(dst); + EIGEN_UNUSED_VARIABLE(src); + EIGEN_UNUSED_VARIABLE(n); + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); +#else + EIGEN_UNUSED_VARIABLE(buffer) + EIGEN_UNUSED_VARIABLE(c) + EIGEN_UNUSED_VARIABLE(n) + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + template + EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const { +#ifndef EIGEN_GPU_COMPILE_PHASE + const size_t count = end - begin; + // Split value into bytes and run memset with stride. + const int value_size = sizeof(value); + char* buffer = (char*)begin; + char* value_bytes = (char*)(&value); + gpuError_t err; + EIGEN_UNUSED_VARIABLE(err) + + // If all value bytes are equal, then a single memset can be much faster. 
+ bool use_single_memset = true; + for (int i = 1; i < value_size; ++i) { + if (value_bytes[i] != value_bytes[0]) { + use_single_memset = false; + } + } + + if (use_single_memset) { + err = gpuMemsetAsync(buffer, value_bytes[0], count * sizeof(T), stream_->stream()); + gpu_assert(err == gpuSuccess); + } else { + for (int b = 0; b < value_size; ++b) { + err = gpuMemset2DAsync(buffer + b, value_size, value_bytes[b], 1, count, stream_->stream()); + gpu_assert(err == gpuSuccess); + } + } +#else + EIGEN_UNUSED_VARIABLE(begin) + EIGEN_UNUSED_VARIABLE(end) + EIGEN_UNUSED_VARIABLE(value) + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + // FIXME + return 48 * 1024; + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on hip/cuda devices. + return firstLevelCacheSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuStreamSynchronize(stream_->stream()); + if (err != gpuSuccess) { + std::cerr << "Error detected in GPU stream: " << gpuGetErrorString(err) << std::endl; + gpu_assert(err == gpuSuccess); + } +#else + gpu_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const { return stream_->deviceProperties().multiProcessorCount; } + EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const { return stream_->deviceProperties().maxThreadsPerBlock; } + EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const { + return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + EIGEN_STRONG_INLINE int sharedMemPerBlock() const { + return static_cast(stream_->deviceProperties().sharedMemPerBlock); + } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { return stream_->deviceProperties().major; } + EIGEN_STRONG_INLINE int minorDeviceVersion() const { return stream_->deviceProperties().minor; } + + EIGEN_STRONG_INLINE int maxBlocks() const { return max_blocks_; } + + // This function checks if the GPU runtime recorded an error for the + // underlying stream device. + inline bool ok() const { +#ifdef EIGEN_GPUCC + gpuError_t error = gpuStreamQuery(stream_->stream()); + return (error == gpuSuccess) || (error == gpuErrorNotReady); +#else + return false; +#endif + } + + private: + const StreamInterface* stream_; + int max_blocks_; +}; + +#if defined(EIGEN_HIPCC) + +#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \ + gpu_assert(hipGetLastError() == hipSuccess); + +#else + +#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>(__VA_ARGS__); \ + gpu_assert(cudaGetLastError() == cudaSuccess); + +#endif + +// FIXME: Should be device and kernel specific. 
+#ifdef EIGEN_GPUCC +static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t status = gpuDeviceSetSharedMemConfig(config); + EIGEN_UNUSED_VARIABLE(status) + gpu_assert(status == gpuSuccess); +#else + EIGEN_UNUSED_VARIABLE(config) +#endif +} +#endif + +} // end namespace Eigen + +// undefine all the gpu* macros we defined at the beginning of the file +#include "TensorGpuHipCudaUndefines.h" + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h new file mode 100644 index 00000000000..b291fed66a5 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -0,0 +1,567 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Copyright (C) 2016 Benoit Steiner + +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H +#include + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +namespace TensorSycl { +namespace internal { + +/// Cache all the device information needed +struct SyclDeviceInfo { + SyclDeviceInfo(cl::sycl::queue queue) + : local_mem_type(queue.get_device().template get_info()), + max_work_item_sizes(queue.get_device().template get_info>()), + max_mem_alloc_size(queue.get_device().template get_info()), + max_compute_units(queue.get_device().template get_info()), + max_work_group_size(queue.get_device().template get_info()), + local_mem_size(queue.get_device().template get_info()), + platform_name(queue.get_device().get_platform().template get_info()), + device_name(queue.get_device().template get_info()), + device_vendor(queue.get_device().template get_info()) {} + + cl::sycl::info::local_mem_type local_mem_type; + cl::sycl::id<3> max_work_item_sizes; + unsigned long max_mem_alloc_size; + unsigned long max_compute_units; + unsigned long max_work_group_size; + size_t local_mem_size; + std::string platform_name; + std::string device_name; + std::string device_vendor; +}; + +} // end namespace internal +} // end namespace TensorSycl + +// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and +// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently +// TensorFlow via the Eigen SYCL Backend. 
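+// For instance (a sketch; requires EIGEN_USE_SYCL):
+//
+//   auto devices = get_sycl_supported_devices();
+//   Eigen::QueueInterface queue_interface(devices[0]);  // defined below
+//   Eigen::SyclDevice sycl_device(&queue_interface);
+//   C.device(sycl_device) = A + B;
+//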
+EIGEN_STRONG_INLINE auto get_sycl_supported_devices() -> decltype(cl::sycl::device::get_devices()) { +#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR + return {cl::sycl::device(cl::sycl::default_selector())}; +#else + std::vector supported_devices; + auto platform_list = cl::sycl::platform::get_platforms(); + for (const auto &platform : platform_list) { + auto device_list = platform.get_devices(); + auto platform_name = platform.template get_info(); + std::transform(platform_name.begin(), platform_name.end(), platform_name.begin(), ::tolower); + for (const auto &device : device_list) { + auto vendor = device.template get_info(); + std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); + bool unsupported_condition = (device.is_cpu() && platform_name.find("amd") != std::string::npos && + vendor.find("apu") == std::string::npos) || + (platform_name.find("experimental") != std::string::npos) || device.is_host(); + if (!unsupported_condition) { + supported_devices.push_back(device); + } + } + } + return supported_devices; +#endif +} + +class QueueInterface { + public: + /// Creating device by using cl::sycl::selector or cl::sycl::device. + template + explicit QueueInterface(const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler, + unsigned num_threads = std::thread::hardware_concurrency()) + : m_queue{dev_or_sel, handler, {sycl::property::queue::in_order()}}, + m_thread_pool(num_threads), + m_device_info(m_queue) {} + + template + explicit QueueInterface(const DeviceOrSelector &dev_or_sel, + unsigned num_threads = std::thread::hardware_concurrency()) + : QueueInterface( + dev_or_sel, [this](cl::sycl::exception_list l) { this->exception_caught_ = this->sycl_async_handler(l); }, + num_threads) {} + + explicit QueueInterface(const cl::sycl::queue &q, unsigned num_threads = std::thread::hardware_concurrency()) + : m_queue(q), m_thread_pool(num_threads), m_device_info(m_queue) {} + + EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + return (void *)cl::sycl::aligned_alloc_device(EIGEN_MAX_ALIGN_BYTES, num_bytes, m_queue); +#else + return (void *)cl::sycl::malloc_device(num_bytes, m_queue); +#endif + } + + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { + return (void *)cl::sycl::malloc_device(num_bytes, m_queue); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const { + return data; + } + + EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { deallocate(p); } + + EIGEN_STRONG_INLINE void deallocate_temp(const void *p) const { deallocate_temp(const_cast(p)); } + + EIGEN_STRONG_INLINE void deallocate(void *p) const { cl::sycl::free(p, m_queue); } + + /// The memcpyHostToDevice is used to copy the data from host to device + /// The destination pointer could be deleted before the copy happened which is + /// why a callback function is needed. By default if none is provided, the + /// function is blocking. + EIGEN_STRONG_INLINE void memcpyHostToDevice(void *dst, const void *src, size_t n, + std::function callback) const { + auto e = m_queue.memcpy(dst, src, n); + synchronize_and_callback(e, callback); + } + + /// The memcpyDeviceToHost is used to copy the data from device to host. + /// The source pointer could be deleted before the copy happened which is + /// why a callback function is needed. By default if none is provided, the + /// function is blocking. 
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const void *src, size_t n,
+                                              std::function<void()> callback) const {
+    if (n == 0) {
+      if (callback) callback();
+      return;
+    }
+    auto e = m_queue.memcpy(dst, src, n);
+    synchronize_and_callback(e, callback);
+  }
+
+  /// The memcpy function.
+  /// No callback is required here as both arguments are on the device
+  /// and SYCL can handle the dependency.
+  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
+    if (n == 0) {
+      return;
+    }
+    m_queue.memcpy(dst, src, n).wait();
+  }
+
+  /// the memset function.
+  /// No callback is required here as both arguments are on the device
+  /// and SYCL can handle the dependency.
+  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
+    if (n == 0) {
+      return;
+    }
+    m_queue.memset(data, c, n).wait();
+  }
+
+  template <typename T>
+  EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
+    if (begin == end) {
+      return;
+    }
+    const size_t count = end - begin;
+    m_queue.fill(begin, value, count).wait();
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename Lhs, typename Rhs, typename OutPtr, typename Range,
+            typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(const Lhs &lhs, const Rhs &rhs, OutPtr outptr,
+                                                             Range thread_range, Index scratchSize, T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
+    };
+
+    return m_queue.submit(kernel_functor);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename InPtr, typename OutPtr, typename Range, typename Index,
+            typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(const InPtr &inptr, OutPtr &outptr, Range thread_range,
+                                                            Index scratchSize, T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, outptr, var...));
+    };
+    return m_queue.submit(kernel_functor);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename InPtr, typename Range, typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(const InPtr &inptr, Range thread_range,
+                                                              Index scratchSize, T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, var...));
+    };
+
+    return m_queue.submit(kernel_functor);
+  }
+
+  EIGEN_STRONG_INLINE void synchronize() const {
+#ifdef EIGEN_EXCEPTIONS
+    m_queue.wait_and_throw();
+#else
+    m_queue.wait();
+#endif
+  }
+
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
+    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                        static_cast<Index>(tileSize));
+    rng = n;
+    if (rng == 0) rng = static_cast<Index>(1);
+    GRange = rng;
+    if (tileSize > GRange)
+      tileSize = GRange;
+    else if (GRange > tileSize) {
+      Index xMode = static_cast<Index>(GRange % tileSize);
+      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
+    }
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+                                              cl::sycl::range<2> &local_range) const {
+    std::array<Index, 2> input_range = input_dim;
+    Index max_workgroup_Size = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    max_workgroup_Size = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                                  static_cast<Index>(max_workgroup_Size));
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    local_range[1] = static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0) global_range[1] += static_cast<Index>(local_range[1] - xMode);
+    }
+    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0) global_range[0] += static_cast<Index>(local_range[0] - xMode);
+    }
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+                                              cl::sycl::range<3> &local_range) const {
+    std::array<Index, 3> input_range = input_dim;
+    Index max_workgroup_Size = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    max_workgroup_Size = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                                  static_cast<Index>(max_workgroup_Size));
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    local_range[2] = static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
+    input_range[2] = input_dim[2];
+    if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
+    global_range[2] = input_range[2];
+    if (local_range[2] > global_range[2])
+      local_range[2] = global_range[2];
+    else if (global_range[2] > local_range[2]) {
+      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
+      if (xMode != 0) global_range[2] += static_cast<Index>(local_range[2] - xMode);
+    }
+    pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
+    local_range[1] = static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0) global_range[1] += static_cast<Index>(local_range[1] - xMode);
+    }
+    local_range[0] = static_cast<Index>(max_workgroup_Size / (local_range[1] * local_range[2]));
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0) global_range[0] += static_cast<Index>(local_range[0] - xMode);
+    }
+  }
+
+  EIGEN_STRONG_INLINE bool has_local_memory() const {
+#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+    return false;
+#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
+    return true;
+#else
+    return m_device_info.local_mem_type == cl::sycl::info::local_mem_type::local;
+#endif
+  }
+
+  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const { return m_device_info.max_mem_alloc_size; }
+
+  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { return m_device_info.max_compute_units; }
+
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { return m_device_info.max_work_group_size; }
+
+  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { return m_device_info.max_work_item_sizes; }
+
+  /// No need for sycl it should act the same as CPU version
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
+
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+    // OpenCL does not have such a concept
+    return 2;
+  }
+
+  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { return m_device_info.local_mem_size; }
+
+  // This function returns the nearest power-of-two work-group size which is
+  // <= the maximum device work-group size.
+  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
+    return getPowerOfTwo(m_device_info.max_work_group_size, false);
+  }
+
+  EIGEN_STRONG_INLINE std::string getPlatformName() const { return m_device_info.platform_name; }
+
+  EIGEN_STRONG_INLINE std::string getDeviceName() const { return m_device_info.device_name; }
+
+  EIGEN_STRONG_INLINE std::string getDeviceVendor() const { return m_device_info.device_vendor; }
+
+  // This function returns the nearest power of 2:
+  // if roundUp is true, the result is >= wGSize;
+  // otherwise the result is <= wGSize.
+  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
+    if (roundUp) --wGSize;
+    wGSize |= (wGSize >> 1);
+    wGSize |= (wGSize >> 2);
+    wGSize |= (wGSize >> 4);
+    wGSize |= (wGSize >> 8);
+    wGSize |= (wGSize >> 16);
+#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
+    wGSize |= (wGSize >> 32);
+#endif
+    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
+  }
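+  // Editor's illustration (not upstream code): the OR-shift cascade above
+  // smears the highest set bit into every lower bit. For wGSize = 48
+  // (0b110000) it produces 63 (0b111111); with roundUp == false,
+  // 63 - (63 >> 1) == 32 is the largest power of two <= 48, while with
+  // roundUp == true the initial --wGSize and final ++wGSize turn 47 into 64,
+  // the smallest power of two >= 48.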
+  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
+
+  // This function checks if the runtime recorded an error for the
+  // underlying stream device.
+  EIGEN_STRONG_INLINE bool ok() const {
+    if (!exception_caught_) {
+      synchronize();
+    }
+    return !exception_caught_;
+  }
+
+ protected:
+  void synchronize_and_callback(cl::sycl::event e, const std::function<void()> &callback) const {
+    if (callback) {
+      auto callback_ = [=]() {
+#ifdef EIGEN_EXCEPTIONS
+        cl::sycl::event(e).wait_and_throw();
+#else
+        cl::sycl::event(e).wait();
+#endif
+        callback();
+      };
+      m_thread_pool.Schedule(std::move(callback_));
+    } else {
+#ifdef EIGEN_EXCEPTIONS
+      m_queue.wait_and_throw();
+#else
+      m_queue.wait();
+#endif
+    }
+  }
+
+  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
+    bool exception_caught = false;
+    for (const auto &e : exceptions) {
+      if (e) {
+        exception_caught = true;
+        EIGEN_THROW_X(e);
+      }
+    }
+    return exception_caught;
+  }
+
+  /// class members:
+  bool exception_caught_ = false;
+  /// sycl queue
+  mutable cl::sycl::queue m_queue;
+  /// The thread pool is used to wait on events and call callbacks
+  /// asynchronously
+  mutable Eigen::ThreadPool m_thread_pool;
+
+  const TensorSycl::internal::SyclDeviceInfo m_device_info;
+};
+
+struct SyclDeviceBase {
+  /// QueueInterface is not owned. It is the caller's responsibility to
+  /// destroy it.
+  const QueueInterface *m_queue_stream;
+  explicit SyclDeviceBase(const QueueInterface *queue_stream) : m_queue_stream(queue_stream) {}
+  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const { return m_queue_stream; }
+};
+
+// Here is a sycl device struct which accepts the sycl queue interface
+// as an input
+struct SyclDevice : public SyclDeviceBase {
+  explicit SyclDevice(const QueueInterface *queue_stream) : SyclDeviceBase(queue_stream) {}
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
+    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+                                              cl::sycl::range<2> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+                                              cl::sycl::range<3> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
+  }
+
+  /// allocate device memory
+  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { return queue_stream()->allocate(num_bytes); }
+
+  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { return queue_stream()->allocate_temp(num_bytes); }
+
+  /// deallocate device memory
+  EIGEN_STRONG_INLINE void deallocate(void *p) const { queue_stream()->deallocate(p); }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const { queue_stream()->deallocate_temp(buffer); }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(const void *buffer) const { queue_stream()->deallocate_temp(buffer); }
+
+  template <typename data_t>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
+    return data;
+  }
+
+  // some runtime conditions that can be applied here
+  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
+
+  /// memcpyHostToDevice
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n,
+                                              std::function<void()> callback = {}) const {
+    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
+  }
+  /// memcpyDeviceToHost
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n,
+                                              std::function<void()> callback = {}) const {
+    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
+  }
+  /// the memcpy function
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
+    queue_stream()->memcpy(dst, src, n);
+  }
+  /// the memset function
+  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { queue_stream()->memset(data, c, n); }
+  /// the fill function
+  template <typename T>
+  EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
+    queue_stream()->fill(begin, end, value);
+  }
+  /// returning the sycl queue
+  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return queue_stream()->sycl_queue(); }
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the l2 cache for the time being, and
+    // there is no l3 cache on sycl devices.
+    return firstLevelCacheSize();
+  }
+  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+    return queue_stream()->getNumSyclMultiProcessors();
+  }
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { return queue_stream()->maxSyclThreadsPerBlock(); }
+  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { return queue_stream()->maxWorkItemSizes(); }
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+    // OpenCL does not have such a concept
+    return queue_stream()->maxSyclThreadsPerMultiProcessor();
+  }
+  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { return queue_stream()->sharedMemPerBlock(); }
+  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
+    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
+  }
+
+  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
+    return queue_stream()->getPowerOfTwo(val, roundUp);
+  }
+  /// No need for sycl it should act the same as CPU version
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return queue_stream()->majorDeviceVersion(); }
+
+  EIGEN_STRONG_INLINE void synchronize() const { queue_stream()->synchronize(); }
+
+  // This function checks if the runtime recorded an error for the
+  // underlying stream device.
+  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
+
+  EIGEN_STRONG_INLINE bool has_local_memory() const { return queue_stream()->has_local_memory(); }
+  EIGEN_STRONG_INLINE long max_buffer_size() const { return queue_stream()->max_buffer_size(); }
+  EIGEN_STRONG_INLINE std::string getPlatformName() const { return queue_stream()->getPlatformName(); }
+  EIGEN_STRONG_INLINE std::string getDeviceName() const { return queue_stream()->getDeviceName(); }
+  EIGEN_STRONG_INLINE std::string getDeviceVendor() const { return queue_stream()->getDeviceVendor(); }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(T... var) const {
+    return queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(var...);
+  }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(T... var) const {
+    return queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(var...);
+  }
+
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(T... var) const {
+    return queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(var...);
+  }
+};
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
new file mode 100644
index 00000000000..c95c8f22343
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -0,0 +1,376 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// Runs an arbitrary function and then calls Notify() on the passed in
+// Notification.
+template <typename Function, typename... Args>
+struct FunctionWrapperWithNotification {
+  static void run(Notification* n, Function f, Args... args) {
+    f(args...);
+    if (n) {
+      n->Notify();
+    }
+  }
+};
+
+template <typename Function, typename... Args>
+struct FunctionWrapperWithBarrier {
+  static void run(Barrier* b, Function f, Args... args) {
+    f(args...);
+    if (b) {
+      b->Notify();
+    }
+  }
+};
+
+template <typename SyncType>
+static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
+  if (n) {
+    n->Wait();
+  }
+}
+
+// An abstract interface to a device specific memory allocator.
+class Allocator {
+ public:
+  virtual ~Allocator() {}
+  virtual void* allocate(size_t num_bytes) const = 0;
+  virtual void deallocate(void* buffer) const = 0;
+};
+
+// Build a thread pool device on top of an existing pool of threads.
+struct ThreadPoolDevice {
+  // The ownership of the thread pool remains with the caller.
+  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr)
+      : pool_(pool), num_threads_(num_cores), allocator_(allocator) {}
+
+  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+    return allocator_ ? allocator_->allocate(num_bytes) : internal::aligned_malloc(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+    if (allocator_) {
+      allocator_->deallocate(buffer);
+    } else {
+      internal::aligned_free(buffer);
+    }
+  }
+
+  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { return allocate(num_bytes); }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { deallocate(buffer); }
+
+  template <typename Type>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
+    return data;
+  }
+
+  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifdef __ANDROID__
+    ::memcpy(dst, src, n);
+#else
+    // TODO(rmlarsen): Align blocks on cache lines.
+    // We have observed that going beyond 4 threads usually just wastes
+    // CPU cycles due to the threads competing for memory bandwidth, so we
+    // statically schedule at most 4 block copies here.
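+    // Worked example (editor's note, not upstream code): assuming the cost
+    // model picks num_threads = 4 for n = 100001 bytes, blocksize =
+    // (100001 + 3) / 4 = 25001; worker i in {1, 2, 3} copies
+    // numext::mini(25001, 100001 - i * 25001) bytes starting at offset
+    // i * 25001, and the main thread copies the first 25001 bytes itself.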
+    const size_t kMinBlockSize = 32768;
+    const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4);
+    if (n <= kMinBlockSize || num_threads < 2) {
+      ::memcpy(dst, src, n);
+    } else {
+      const char* src_ptr = static_cast<const char*>(src);
+      char* dst_ptr = static_cast<char*>(dst);
+      const size_t blocksize = (n + (num_threads - 1)) / num_threads;
+      Barrier barrier(static_cast<unsigned int>(num_threads - 1));
+      // Launch the last 3 blocks on worker threads.
+      for (size_t i = 1; i < num_threads; ++i) {
+        enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] {
+          ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize, numext::mini(blocksize, n - (i * blocksize)));
+        });
+      }
+      // Launch the first block on the main thread.
+      ::memcpy(dst_ptr, src_ptr, blocksize);
+      barrier.Wait();
+    }
+#endif
+  }
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); }
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); }
+
+  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); }
+
+  template <typename T>
+  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+    std::fill(begin, end, value);
+  }
+
+  EIGEN_STRONG_INLINE int numThreads() const { return num_threads_; }
+
+  // Number of threads available in the underlying thread pool. This number can
+  // be different from the value returned by numThreads().
+  EIGEN_STRONG_INLINE int numThreadsInPool() const { return pool_->NumThreads(); }
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return l1CacheSize(); }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // The l3 cache size is shared between all the cores.
+    return l3CacheSize() / num_threads_;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+    // Nothing. Threadpool device operations are synchronous.
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+    // Should return an enum that encodes the ISA supported by the CPU
+    return 1;
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
+    Notification* n = new Notification();
+    pool_->Schedule(
+        std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, std::forward<Function>(f), args...));
+    return n;
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f, Args&&... args) const {
+    pool_->Schedule(
+        std::bind(&FunctionWrapperWithBarrier<Function, Args...>::run, b, std::forward<Function>(f), args...));
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
+    if (sizeof...(args) > 0) {
+      pool_->Schedule(std::bind(std::forward<Function>(f), args...));
+    } else {
+      pool_->Schedule(std::forward<Function>(f));
+    }
+  }
+
+  // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
+  // called from one of the threads in pool_. Returns -1 otherwise.
+  EIGEN_STRONG_INLINE int currentThreadId() const { return pool_->CurrentThreadId(); }
+
+  // WARNING: This function is synchronous and will block the calling thread.
+  //
+  // Synchronous parallelFor executes f with [0, n) arguments in parallel and
+  // waits for completion. F accepts a half-open interval [first, last). Block
+  // size is chosen based on the iteration cost and resulting parallel
+  // efficiency. If block_align is not nullptr, it is called to round up the
+  // block size.
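+  //
+  // A minimal usage sketch (editor's illustration, not part of the upstream
+  // header; `pool4`, `in`, `out` and `n` are hypothetical caller-side names):
+  //
+  //   Eigen::ThreadPool pool4(4);
+  //   Eigen::ThreadPoolDevice device(&pool4, pool4.NumThreads());
+  //   device.parallelFor(n, Eigen::TensorOpCost(sizeof(float), sizeof(float), 2),
+  //                      [&](Eigen::Index first, Eigen::Index last) {
+  //                        for (Eigen::Index i = first; i < last; ++i) out[i] = 2.f * in[i];
+  //                      });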
+  void parallelFor(Index n, const TensorOpCost& cost, std::function<Index(Index)> block_align,
+                   std::function<void(Index, Index)> f) const {
+    if (EIGEN_PREDICT_FALSE(n <= 0)) {
+      return;
+      // Compute small problems directly in the caller thread.
+    } else if (n == 1 || numThreads() == 1 || CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+      f(0, n);
+      return;
+    }
+
+    // Compute block size and total count of blocks.
+    ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
+
+    // Recursively divide size into halves until we reach block_size.
+    // Division code rounds mid to block_size, so we are guaranteed to get
+    // block_count leaves that do actual computations.
+    Barrier barrier(static_cast<unsigned int>(block.count));
+    std::function<void(Index, Index)> handleRange;
+    handleRange = [this, block, &handleRange, &barrier, &f](Index firstIdx, Index lastIdx) {
+      while (lastIdx - firstIdx > block.size) {
+        // Split into halves and schedule the second half on a different thread.
+        const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size;
+        pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); });
+        lastIdx = midIdx;
+      }
+      // Single block or less, execute directly.
+      f(firstIdx, lastIdx);
+      barrier.Notify();
+    };
+
+    if (block.count <= numThreads()) {
+      // Avoid a thread hop by running the root of the tree and one block on the
+      // main thread.
+      handleRange(0, n);
+    } else {
+      // Execute the root in the thread pool to avoid running work on more than
+      // numThreads() threads.
+      pool_->Schedule([=, &handleRange]() { handleRange(0, n); });
+    }
+
+    barrier.Wait();
+  }
+
+  // Convenience wrapper for parallelFor that does not align blocks.
+  void parallelFor(Index n, const TensorOpCost& cost, std::function<void(Index, Index)> f) const {
+    parallelFor(n, cost, nullptr, std::move(f));
+  }
+
+  // WARNING: This function is asynchronous and will not block the calling thread.
+  //
+  // Asynchronous parallelFor executes f with [0, n) arguments in parallel
+  // without waiting for completion. When the last block finishes, it will call
+  // the 'done' callback. F accepts a half-open interval [first, last). Block
+  // size is chosen based on the iteration cost and resulting parallel
+  // efficiency. If block_align is not nullptr, it is called to round up the
+  // block size.
+  void parallelForAsync(Index n, const TensorOpCost& cost, std::function<Index(Index)> block_align,
+                        std::function<void(Index, Index)> f, std::function<void()> done) const {
+    // Compute small problems directly in the caller thread.
+    if (n <= 1 || numThreads() == 1 || CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+      f(0, n);
+      done();
+      return;
+    }
+
+    // Compute block size and total count of blocks.
+    ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
+
+    ParallelForAsyncContext* const ctx = new ParallelForAsyncContext(block.count, std::move(f), std::move(done));
+
+    // Recursively divide size into halves until we reach block_size.
+    // Division code rounds mid to block_size, so we are guaranteed to get
+    // block_count leaves that do actual computations.
+    ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) {
+      while (lastIdx - firstIdx > block.size) {
+        // Split into halves and schedule the second half on a different thread.
+        const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size;
+        pool_->Schedule([ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); });
+        lastIdx = midIdx;
+      }
+
+      // Single block or less, execute directly.
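+      // Editor's sketch of one split (not upstream code): with firstIdx = 0,
+      // lastIdx = 1000 and block.size = 128, midIdx = 0 +
+      // div_ceil(500, 128) * 128 = 512, so [512, 1000) is rescheduled and the
+      // loop continues with [0, 512); every leaf interval thus starts at a
+      // multiple of block.size.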
+      ctx->f(firstIdx, lastIdx);
+
+      // Delete async context if it was the last block.
+      if (ctx->count.fetch_sub(1) == 1) delete ctx;
+    };
+
+    if (block.count <= numThreads()) {
+      // Avoid a thread hop by running the root of the tree and one block on the
+      // main thread.
+      ctx->handle_range(0, n);
+    } else {
+      // Execute the root in the thread pool to avoid running work on more than
+      // numThreads() threads.
+      pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); });
+    }
+  }
+
+  // Convenience wrapper for parallelForAsync that does not align blocks.
+  void parallelForAsync(Index n, const TensorOpCost& cost, std::function<void(Index, Index)> f,
+                        std::function<void()> done) const {
+    parallelForAsync(n, cost, nullptr, std::move(f), std::move(done));
+  }
+
+  // Thread pool accessor.
+  ThreadPoolInterface* getPool() const { return pool_; }
+
+  // Allocator accessor.
+  Allocator* allocator() const { return allocator_; }
+
+ private:
+  typedef TensorCostModel<ThreadPoolDevice> CostModel;
+
+  // For parallelForAsync we must keep the passed in closures on the heap, and
+  // delete them only after the `done` callback has finished.
+  struct ParallelForAsyncContext {
+    ParallelForAsyncContext(Index block_count, std::function<void(Index, Index)> block_f,
+                            std::function<void()> done_callback)
+        : count(block_count), f(std::move(block_f)), done(std::move(done_callback)) {}
+    ~ParallelForAsyncContext() { done(); }
+
+    std::atomic<Index> count;
+    std::function<void(Index, Index)> f;
+    std::function<void()> done;
+
+    std::function<void(Index, Index)> handle_range;
+  };
+
+  struct ParallelForBlock {
+    Index size;   // block size
+    Index count;  // number of blocks
+  };
+
+  // Calculates block size based on (1) the iteration cost and (2) parallel
+  // efficiency. We want blocks to be not too small to mitigate parallelization
+  // overheads; not too large to mitigate tail effect and potential load
+  // imbalance, and we also want the number of blocks to be evenly divisible
+  // across threads.
+  ParallelForBlock CalculateParallelForBlock(const Index n, const TensorOpCost& cost,
+                                             std::function<Index(Index)> block_align) const {
+    const double block_size_f = 1.0 / CostModel::taskSize(1, cost);
+    const Index max_oversharding_factor = 4;
+    Index block_size = numext::mini(
+        n, numext::maxi<Index>(numext::div_ceil<Index>(n, max_oversharding_factor * numThreads()), block_size_f));
+    const Index max_block_size = numext::mini(n, 2 * block_size);
+
+    if (block_align) {
+      Index new_block_size = block_align(block_size);
+      eigen_assert(new_block_size >= block_size);
+      block_size = numext::mini(n, new_block_size);
+    }
+
+    Index block_count = numext::div_ceil(n, block_size);
+
+    // Calculate parallel efficiency as fraction of total CPU time used for
+    // computations:
+    double max_efficiency =
+        static_cast<double>(block_count) / (numext::div_ceil<Index>(block_count, numThreads()) * numThreads());
+
+    // Now try to increase block size up to max_block_size as long as it
+    // doesn't decrease parallel efficiency.
+    for (Index prev_block_count = block_count; max_efficiency < 1.0 && prev_block_count > 1;) {
+      // This is the next block size that divides size into a smaller number
+      // of blocks than the current block_size.
+      Index coarser_block_size = numext::div_ceil(n, prev_block_count - 1);
+      if (block_align) {
+        Index new_block_size = block_align(coarser_block_size);
+        eigen_assert(new_block_size >= coarser_block_size);
+        coarser_block_size = numext::mini(n, new_block_size);
+      }
+      if (coarser_block_size > max_block_size) {
+        break;  // Reached max block size. Stop.
+      }
+      // Recalculate parallel efficiency.
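+      // Worked example (editor's note, not upstream code): with numThreads()
+      // = 4, 10 blocks give efficiency 10 / (div_ceil(10, 4) * 4) = 10 / 12,
+      // i.e. ~0.83, while coarsening to 8 blocks gives 8 / (2 * 4) = 1.0, so
+      // the coarser block size would be accepted below.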
+      const Index coarser_block_count = numext::div_ceil(n, coarser_block_size);
+      eigen_assert(coarser_block_count < prev_block_count);
+      prev_block_count = coarser_block_count;
+      const double coarser_efficiency = static_cast<double>(coarser_block_count) /
+                                        (numext::div_ceil<Index>(coarser_block_count, numThreads()) * numThreads());
+      if (coarser_efficiency + 0.01 >= max_efficiency) {
+        // Taking it.
+        block_size = coarser_block_size;
+        block_count = coarser_block_count;
+        if (max_efficiency < coarser_efficiency) {
+          max_efficiency = coarser_efficiency;
+        }
+      }
+    }
+
+    return {block_size, block_count};
+  }
+
+  ThreadPoolInterface* pool_;
+  int num_threads_;
+  Allocator* allocator_;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
new file mode 100644
index 00000000000..315b556aefc
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
@@ -0,0 +1,119 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \internal
+ *
+ * \class TensorDimensionList
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n.
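+ *
+ * For instance (editor's illustration), DimensionList<DenseIndex, 3> reads
+ * back as the index list {0, 1, 2}: operator[] simply returns its argument,
+ * which is exactly the list of dimension indices a "reduce over every
+ * dimension" operation needs.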
+ * + * \sa Tensor + */ + +template +struct DimensionList { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const Index operator[](const Index i) const { return i; } +}; + +namespace internal { + +template +struct array_size > { + static const size_t value = Rank; +}; +template +struct array_size > { + static const size_t value = Rank; +}; + +template +const Index array_get(DimensionList&) { + return n; +} +template +const Index array_get(const DimensionList&) { + return n; +} + +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { return true; } +}; +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { return true; } +}; + +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { return true; } +}; +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { return true; } +}; + +template +struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { return true; } +}; +template +struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { return true; } +}; + +template +struct index_statically_eq_impl > { + static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i == value; } +}; +template +struct index_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i == value; } +}; + +template +struct index_statically_ne_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i != value; } +}; +template +struct index_statically_ne_impl > { + static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i != value; } +}; + +template +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i > value; } +}; +template +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i > value; } +}; + +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i < value; } +}; +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i < value; } +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h new file mode 100644 index 00000000000..06d4dfd30dd --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -0,0 +1,329 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \internal + * + * \class TensorDimensions + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode and store the dimensions of a Tensor. + * + * The Sizes class encodes as part of the type the number of dimensions and the + * sizes corresponding to each dimension. It uses no storage space since it is + * entirely known at compile time. + * The DSizes class is its dynamic sibling: the number of dimensions is known + * at compile time but the sizes are set during execution. + * + * \sa Tensor + */ + +// Boilerplate code +namespace internal { + +template +struct dget { + static const std::ptrdiff_t value = get::value; +}; + +template +struct fixed_size_tensor_index_linearization_helper { + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index run(array const& indices, + const Dimensions& dimensions) { + return array_get < RowMajor ? n - 1 + : (NumIndices - n) > (indices) + dget < RowMajor ? n - 1 + : (NumIndices - n), + Dimensions > ::value * fixed_size_tensor_index_linearization_helper::run( + indices, dimensions); + } +}; + +template +struct fixed_size_tensor_index_linearization_helper { + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index run(array const&, const Dimensions&) { + return 0; + } +}; + +template +struct fixed_size_tensor_index_extraction_helper { + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index run(const Index index, const Dimensions& dimensions) { + const Index mult = (index == n - 1) ? 1 : 0; + return array_get(dimensions) * mult + + fixed_size_tensor_index_extraction_helper::run(index, dimensions); + } +}; + +template +struct fixed_size_tensor_index_extraction_helper { + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index run(const Index, const Dimensions&) { + return 0; + } +}; + +} // end namespace internal + +// Fixed size +template +struct Sizes { + typedef internal::numeric_list Base; + const Base t = Base(); + static const std::ptrdiff_t total_size = internal::arg_prod(Indices...); + static const ptrdiff_t count = Base::count; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const { return Base::count; } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() { return internal::arg_prod(Indices...); } + + EIGEN_DEVICE_FUNC Sizes() {} + template + explicit EIGEN_DEVICE_FUNC Sizes(const array& /*indices*/) { + // todo: add assertion + } + template + EIGEN_DEVICE_FUNC Sizes(DenseIndex...) 
{} + explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list /*l*/) { + // todo: add assertion + } + + template + Sizes& operator=(const T& /*other*/) { + // add assertion failure if the size of other is different + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[](const std::ptrdiff_t index) const { + return internal::fixed_size_tensor_index_extraction_helper::run(index, t); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t IndexOfColMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run( + indices, t); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t IndexOfRowMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run( + indices, t); + } +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes&) { + return Sizes::total_size; +} +} // namespace internal + +// Boilerplate +namespace internal { +template +struct tensor_index_linearization_helper { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index run(array const& indices, + array const& dimensions) { + return array_get < RowMajor ? n + : (NumIndices - n - 1) > (indices) + array_get < RowMajor + ? n + : (NumIndices - n - 1) > + (dimensions)*tensor_index_linearization_helper::run( + indices, dimensions); + } +}; + +template +struct tensor_index_linearization_helper { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index run(array const& indices, + array const&) { + return array_get < RowMajor ? 0 : NumIndices - 1 > (indices); + } +}; +} // end namespace internal + +// Dynamic size +template +struct DSizes : array { + typedef array Base; + static const int count = NumDims; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumDims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const { + return (NumDims == 0) ? 1 : internal::array_prod(*static_cast(this)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() { + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = 0; + } + } + EIGEN_DEVICE_FUNC explicit DSizes(const array& a) : Base(a) {} + + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { + eigen_assert(NumDims == 1); + (*this)[0] = i0; + } + + EIGEN_DEVICE_FUNC DSizes(const DimensionList& a) { + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + + // Enable DSizes index type promotion only if we are promoting to the + // larger type, e.g. allow to promote dimensions of type int to long. + template + EIGEN_DEVICE_FUNC explicit DSizes( + const array& other, + // Default template parameters require c++11. + std::enable_if_t< + internal::is_same::type>::value, + void*> = 0) { + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = static_cast(other[i]); + } + } + + template + EIGEN_DEVICE_FUNC explicit DSizes(const Eigen::IndexList& dimensions) { + for (int i = 0; i < dimensions.count; ++i) { + (*this)[i] = dimensions[i]; + } + } + + template + EIGEN_DEVICE_FUNC DSizes(const Sizes& a) { + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, + IndexTypes... 
otherDimensions) + : Base({{firstDimension, secondDimension, otherDimensions...}}) { + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + + EIGEN_DEVICE_FUNC DSizes& operator=(const array& other) { + *static_cast(this) = other; + return *this; + } + + // A constexpr would be so much better here + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array& indices) const { + return internal::tensor_index_linearization_helper::run( + indices, *static_cast(this)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array& indices) const { + return internal::tensor_index_linearization_helper::run( + indices, *static_cast(this)); + } +}; + +template +std::ostream& operator<<(std::ostream& os, const DSizes& dims) { + os << "["; + for (int i = 0; i < NumDims; ++i) { + if (i > 0) os << ", "; + os << dims[i]; + } + os << "]"; + return os; +} + +// Boilerplate +namespace internal { +template +struct tensor_vsize_index_linearization_helper { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index run(array const& indices, + std::vector const& dimensions) { + return array_get < RowMajor ? n + : (NumIndices - n - 1) > (indices) + array_get < RowMajor + ? n + : (NumIndices - n - 1) > + (dimensions)*tensor_vsize_index_linearization_helper::run( + indices, dimensions); + } +}; + +template +struct tensor_vsize_index_linearization_helper { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index run(array const& indices, + std::vector const&) { + return array_get < RowMajor ? 0 : NumIndices - 1 > (indices); + } +}; +} // end namespace internal + +namespace internal { + +template +struct array_size > { + static const ptrdiff_t value = NumDims; +}; +template +struct array_size > { + static const ptrdiff_t value = NumDims; +}; +template +struct array_size > { + static const std::ptrdiff_t value = Sizes::count; +}; +template +struct array_size > { + static const std::ptrdiff_t value = Sizes::count; +}; +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes&) { + return get >::value; +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) { + eigen_assert(false && "should never be called"); + return -1; +} + +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { return false; } +}; +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) { + return numext::equal_strict(array_get(dims1), array_get(dims2)) && + sizes_match_below_dim::run(dims1, dims2); + } +}; +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { return true; } +}; + +} // end namespace internal + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) { + return internal::sizes_match_below_dim::value, + internal::array_size::value>::run(dims1, dims2); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h new file mode 100644 index 00000000000..8f1e4c4ba0b --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -0,0 +1,196 @@ +// This file is part of Eigen, a 
lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template class MakePointer_> +struct traits > { + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename MakePointer_::Type PointerType; + + enum { Flags = 0 }; + template + struct MakePointer { + // Intermediate typedef to workaround MSVC issue. + typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; +}; + +template class MakePointer_> +struct eval, Eigen::Dense> { + typedef const TensorEvalToOp& type; +}; + +template class MakePointer_> +struct nested, 1, typename eval >::type> { + typedef TensorEvalToOp type; +}; + +} // end namespace internal + +template class MakePointer_> +class TensorEvalToOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename MakePointer_::Type PointerType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + static constexpr int NumDims = Eigen::internal::traits::NumDimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr) + : m_xpr(expr), m_buffer(buffer) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; } + + protected: + typename XprType::Nested m_xpr; + PointerType m_buffer; +}; + +template class MakePointer_> +struct TensorEvaluator, Device> { + typedef TensorEvalToOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef std::remove_const_t CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = true, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = true + }; + + static constexpr int Layout = TensorEvaluator::Layout; + static constexpr int NumDims = internal::traits::NumDimensions; + + //===- Tensor block evaluation strategy (see TensorBlock.h) 
-------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + typedef internal::TensorBlockAssignment + TensorBlockAssignment; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()) {} + + EIGEN_STRONG_INLINE ~TensorEvaluator() {} + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) { + EIGEN_UNUSED_VARIABLE(scalar); + eigen_assert(scalar == NULL); + return m_impl.evalSubExprsIfNeeded(m_buffer); + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType scalar, EvalSubExprsCallback done) { + EIGEN_UNUSED_VARIABLE(scalar); + eigen_assert(scalar == NULL); + m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const { m_buffer[i] = m_impl.coeff(i); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const { + internal::pstoret( + m_buffer + i, m_impl.template packet < TensorEvaluator::IsAligned ? Aligned : Unaligned > (i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return m_impl.getResourceRequirements(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlockDesc& desc, TensorBlockScratch& scratch) { + // Add `m_buffer` as destination buffer to the block descriptor. + desc.template AddDestinationBuffer( + /*dst_base=*/m_buffer + desc.offset(), + /*dst_strides=*/internal::strides(m_impl.dimensions())); + + ArgTensorBlock block = m_impl.block(desc, scratch, /*root_of_expr_ast=*/true); + + // If block was evaluated into a destination buffer, there is no need to do + // an assignment. + if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { + TensorBlockAssignment::Run( + TensorBlockAssignment::target(desc.dimensions(), internal::strides(m_impl.dimensions()), m_buffer, + desc.offset()), + block.expr()); + } + block.cleanup(); + } + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_buffer[index]; } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt(m_buffer + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here. 
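+  // Editor's note (illustrative, not upstream code): the TensorOpCost
+  // constructor used below is (bytes_loaded, bytes_stored, compute_cycles,
+  // vectorized, packet_size); only the bytes_stored slot is populated, so for
+  // a float output this charges sizeof(float) = 4 stored bytes per
+  // coefficient on top of the wrapped evaluator's own cost.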
+ return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; } + ArgType expression() const { return m_expression; } + + private: + TensorEvaluator m_impl; + EvaluatorPointerType m_buffer; + const ArgType m_expression; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h new file mode 100644 index 00000000000..d864fb46402 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -0,0 +1,859 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor evaluator classes. + * + * These classes are responsible for the evaluation of the tensor expression. + * + * TODO: add support for more types of expressions, in particular expressions + * leading to lvalues (slicing, reshaping, etc...) + */ + +// Generic evaluator +template +struct TensorEvaluator { + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + typedef Derived XprType; + static constexpr int PacketSize = PacketType::size; + typedef typename internal::traits::template MakePointer::Type TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + // NumDimensions is -1 for variable dim tensors + static constexpr int NumCoords = + internal::traits::NumDimensions > 0 ? 
internal::traits<Derived>::NumDimensions : 0;
+  static constexpr int Layout = Derived::Layout;
+
+  enum {
+    IsAligned = Derived::IsAligned,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = internal::is_arithmetic<std::remove_const_t<Scalar>>::value,
+    PreferBlockAccess = false,
+    CoordAccess = NumCoords > 0,
+    RawAccess = true
+  };
+
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords, Layout, Index> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+      : m_data(device.get((const_cast<TensorPointerType>(m.data())))), m_dims(m.dimensions()), m_device(device) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) {
+    if (!NumTraits<std::remove_const_t<Scalar>>::RequireInitialization && dest) {
+      m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
+      return false;
+    }
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    // TODO(ezhulenev): ThreadPoolDevice memcpy is a blocking operation.
+    done(evalSubExprsIfNeeded(dest));
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(m_data != NULL);
+    return m_data[index];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const {
+    eigen_assert(m_data != NULL);
+    return m_data[index];
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
+  }
+
+  // Return a packet starting at `index` where `umask` specifies which elements
+  // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for
+  // Packet16f, `umask` is of type uint16_t and if a bit is 1, the corresponding
+  // float element will be loaded, otherwise 0 will be loaded.
+  // Function has been templatized to enable Sfinae.
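+  // Editor's sketch (not upstream code): with PacketReturnTypeT = Packet16f
+  // the mask is a uint16_t, so umask = 0b0000000000000101 loads elements 0
+  // and 2 from m_data + index and fills the other fourteen lanes with 0.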
+ template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + std::enable_if_t::masked_load_available, PacketReturnTypeT> + partialPacket(Index index, typename internal::unpacket_traits::mask_t umask) const { + return internal::ploadu(m_data + index, umask); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + return internal::pstoret(m_data + index, x); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { + eigen_assert(m_data != NULL); + if (static_cast(Layout) == static_cast(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array& coords) const { + eigen_assert(m_data != NULL); + if (static_cast(Layout) == static_cast(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketType::size); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + eigen_assert(m_data != NULL); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) { + eigen_assert(m_data != NULL); + + typedef typename TensorBlock::XprType TensorBlockExpr; + typedef internal::TensorBlockAssignment TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target(desc.dimensions(), internal::strides(m_dims), m_data, desc.offset()), + block.expr()); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } + + protected: + EvaluatorPointerType m_data; + Dimensions m_dims; + const Device EIGEN_DEVICE_REF m_device; +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T loadConstant(const T* address) { + return *address; +} +// Use the texture cache on CUDA devices whenever possible +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) { + return __ldg(address); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double loadConstant(const double* address) { + return __ldg(address); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half loadConstant(const Eigen::half* address) { + return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); +} +#endif + +} // namespace internal + +// Default evaluator for rvalues +template +struct TensorEvaluator { + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + typedef const Derived XprType; + typedef typename internal::traits::template MakePointer::Type TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + typedef std::remove_const_t ScalarNoConst; + + // NumDimensions is 
+  // -1 for variable dim tensors
+  static constexpr int NumCoords =
+      internal::traits<Derived>::NumDimensions > 0 ? internal::traits<Derived>::NumDimensions : 0;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static constexpr int Layout = Derived::Layout;
+
+  enum {
+    IsAligned = Derived::IsAligned,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = internal::is_arithmetic<ScalarNoConst>::value,
+    PreferBlockAccess = false,
+    CoordAccess = NumCoords > 0,
+    RawAccess = true
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords, Layout, Index> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC TensorEvaluator(const Derived& m, const Device& device)
+      : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    if (!NumTraits<std::remove_const_t<Scalar>>::RequireInitialization && data) {
+      m_device.memcpy((void*)(m_device.get(data)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
+      return false;
+    }
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    // TODO(ezhulenev): ThreadPoolDevice memcpy is a blocking operation.
+    done(evalSubExprsIfNeeded(dest));
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(m_data != NULL);
+    return internal::loadConstant(m_data + index);
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index);
+  }
+
+  // Return a packet starting at `index` where `umask` specifies which elements
+  // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for
+  // Packet16f, `umask` is of type uint16_t and if a bit is 1, the corresponding
+  // float element will be loaded, otherwise 0 will be loaded.
+  // Function has been templatized to enable Sfinae.
+  template <typename PacketReturnTypeT>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      std::enable_if_t<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>
+      partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const {
+    return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumCoords>& coords) const {
+    eigen_assert(m_data != NULL);
+    const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ?
m_dims.IndexOfColMajor(coords) + : m_dims.IndexOfRowMajor(coords); + return internal::loadConstant(m_data + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketType::size); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + eigen_assert(m_data != NULL); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } + + protected: + EvaluatorPointerType m_data; + Dimensions m_dims; + const Device EIGEN_DEVICE_REF m_device; +}; + +// -------------------- CwiseNullaryOp -------------------- + +template +struct TensorEvaluator, Device> { + typedef TensorCwiseNullaryOp XprType; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = true, + PacketAccess = internal::functor_traits::PacketAccess +#ifdef EIGEN_USE_SYCL + && (PacketType::size > 1) +#endif + , + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + done(true); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() {} + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_wrapper(m_functor, index); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return m_wrapper.template packetOp(m_functor, index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketType::size); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + private: + const NullaryOp m_functor; + TensorEvaluator m_argImpl; + const internal::nullary_wrapper m_wrapper; +}; + +// -------------------- CwiseUnaryOp -------------------- + +template +struct TensorEvaluator, Device> { + typedef TensorCwiseUnaryOp XprType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = + 
int(TensorEvaluator::PacketAccess) & int(internal::functor_traits::PacketAccess), + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef std::remove_const_t ScalarNoConst; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + static constexpr int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + typedef internal::TensorCwiseUnaryBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_argImpl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_argImpl.cleanup(); } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_functor(m_argImpl.coeff(index)); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return m_functor.packetOp(m_argImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_argImpl.costPerCoeff(vectorized) + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const double functor_cost = internal::functor_traits::Cost; + return m_argImpl.getResourceRequirements().addCostPerCoeff({0, 0, functor_cost / PacketSize}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + return TensorBlock(m_argImpl.block(desc, scratch), m_functor); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + private: + const Device EIGEN_DEVICE_REF m_device; + const UnaryOp m_functor; + TensorEvaluator m_argImpl; +}; + +// -------------------- CwiseBinaryOp -------------------- + +template +struct TensorEvaluator, Device> { + typedef TensorCwiseBinaryOp XprType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = + int(TensorEvaluator::IsAligned) & int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess) & + 
int(internal::functor_traits::PacketAccess), + BlockAccess = int(TensorEvaluator::BlockAccess) & + int(TensorEvaluator::BlockAccess), + PreferBlockAccess = int(TensorEvaluator::PreferBlockAccess) | + int(TensorEvaluator::PreferBlockAccess), + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), + m_functor(op.functor()), + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout) || + internal::traits::NumDimensions <= 1), + YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int NumDims = internal::array_size::Dimensions>::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock LeftTensorBlock; + typedef typename TensorEvaluator::TensorBlock RightTensorBlock; + + typedef internal::TensorCwiseBinaryBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { + // TODO: use right impl instead if right impl dimensions are known at compile time. + return m_leftImpl.dimensions(); + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + // TODO(ezhulenev): Evaluate two expression in parallel? 
+ m_leftImpl.evalSubExprsIfNeededAsync( + nullptr, [this, done](bool) { m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { + return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return m_functor.packetOp(m_leftImpl.template packet(index), + m_rightImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_leftImpl.costPerCoeff(vectorized) + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const double functor_cost = internal::functor_traits::Cost; + return internal::TensorBlockResourceRequirements::merge(m_leftImpl.getResourceRequirements(), + m_rightImpl.getResourceRequirements()) + .addCostPerCoeff({0, 0, functor_cost / PacketSize}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + desc.DropDestinationBuffer(); + return TensorBlock(m_leftImpl.block(desc, scratch), m_rightImpl.block(desc, scratch), m_functor); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + private: + const Device EIGEN_DEVICE_REF m_device; + const BinaryOp m_functor; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + +// -------------------- CwiseTernaryOp -------------------- + +template +struct TensorEvaluator, Device> { + typedef TensorCwiseTernaryOp XprType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess && internal::functor_traits::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_arg1Impl(op.arg1Expression(), device), + m_arg2Impl(op.arg2Expression(), device), + m_arg3Impl(op.arg3Expression(), device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout) || + internal::traits::NumDimensions <= 1), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, + typename internal::traits::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, + typename internal::traits::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::Index, + typename internal::traits::Index>::value), + STORAGE_INDEX_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::Index, + typename internal::traits::Index>::value), + STORAGE_INDEX_MUST_MATCH) + + eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && + 
dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { + // TODO: use arg2 or arg3 dimensions if they are known at compile time. + return m_arg1Impl.dimensions(); + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_arg1Impl.evalSubExprsIfNeeded(NULL); + m_arg2Impl.evalSubExprsIfNeeded(NULL); + m_arg3Impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { + m_arg1Impl.cleanup(); + m_arg2Impl.cleanup(); + m_arg3Impl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { + return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return m_functor.packetOp(m_arg1Impl.template packet(index), m_arg2Impl.template packet(index), + m_arg3Impl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_arg1Impl.costPerCoeff(vectorized) + m_arg2Impl.costPerCoeff(vectorized) + + m_arg3Impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + private: + const TernaryOp m_functor; + TensorEvaluator m_arg1Impl; + TensorEvaluator m_arg2Impl; + TensorEvaluator m_arg3Impl; +}; + +// -------------------- SelectOp -------------------- + +template +struct TensorEvaluator, Device> { + typedef TensorSelectOp XprType; + typedef typename XprType::Scalar Scalar; + + using TernarySelectOp = internal::scalar_boolean_select_op::Scalar, + typename internal::traits::Scalar, + typename internal::traits::Scalar>; + static constexpr bool TernaryPacketAccess = + TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess && internal::functor_traits::PacketAccess; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = (TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess && PacketType::HasBlend) || + TernaryPacketAccess, + BlockAccess = TensorEvaluator::BlockAccess && + TensorEvaluator::BlockAccess && + TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_condImpl(op.ifExpression(), device), + m_thenImpl(op.thenExpression(), device), + m_elseImpl(op.elseExpression(), device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + 
static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); + eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock IfArgTensorBlock; + typedef typename TensorEvaluator::TensorBlock ThenArgTensorBlock; + typedef typename TensorEvaluator::TensorBlock ElseArgTensorBlock; + + struct TensorSelectOpBlockFactory { + template + struct XprType { + typedef TensorSelectOp type; + }; + + template + typename XprType::type expr(const IfArgXprType& if_expr, + const ThenArgXprType& then_expr, + const ElseArgXprType& else_expr) const { + return typename XprType::type(if_expr, then_expr, else_expr); + } + }; + + typedef internal::TensorTernaryExprBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { + // TODO: use then or else impl instead if they happen to be known at compile time. + return m_condImpl.dimensions(); + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_condImpl.evalSubExprsIfNeeded(NULL); + m_thenImpl.evalSubExprsIfNeeded(NULL); + m_elseImpl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { + m_thenImpl.evalSubExprsIfNeeded( + nullptr, [this, done](bool) { m_elseImpl.evalSubExprsIfNeeded(nullptr, [done](bool) { done(true); }); }); + }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_condImpl.cleanup(); + m_thenImpl.cleanup(); + m_elseImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { + return m_condImpl.coeff(index) ? 
m_thenImpl.coeff(index) : m_elseImpl.coeff(index); + } + + template = true> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + internal::Selector select; + EIGEN_UNROLL_LOOP + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index + i); + } + return internal::pblend(select, m_thenImpl.template packet(index), + m_elseImpl.template packet(index)); + } + + template = true> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return TernarySelectOp().template packetOp(m_thenImpl.template packet(index), + m_elseImpl.template packet(index), + m_condImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_condImpl.costPerCoeff(vectorized) + + m_thenImpl.costPerCoeff(vectorized).cwiseMax(m_elseImpl.costPerCoeff(vectorized)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + auto then_req = m_thenImpl.getResourceRequirements(); + auto else_req = m_elseImpl.getResourceRequirements(); + + auto merged_req = internal::TensorBlockResourceRequirements::merge(then_req, else_req); + merged_req.cost_per_coeff = then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff); + + return internal::TensorBlockResourceRequirements::merge(m_condImpl.getResourceRequirements(), merged_req); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + // It's unsafe to pass destination buffer to underlying expressions, because + // output might be aliased with one of the inputs. + desc.DropDestinationBuffer(); + + return TensorBlock(m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch), + m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler& cgh) const { + m_condImpl.bind(cgh); + m_thenImpl.bind(cgh); + m_elseImpl.bind(cgh); + } +#endif + private: + TensorEvaluator m_condImpl; + TensorEvaluator m_thenImpl; + TensorEvaluator m_elseImpl; +}; + +} // end namespace Eigen + +#if defined(EIGEN_USE_SYCL) && defined(SYCL_COMPILER_IS_DPCPP) +template +struct cl::sycl::is_device_copyable< + Eigen::TensorEvaluator, + std::enable_if_t>::value>> : std::true_type {}; +#endif + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h new file mode 100644 index 00000000000..fe4b38c4813 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -0,0 +1,672 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
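
An illustrative sketch, not part of the upstream patch: the evaluator specializations above (storage, nullary, unary, binary, ternary, and select) are never instantiated by hand; TensorEvaluator is chosen per expression node when an assignment runs. Assuming only the public Tensor API and the default single-threaded device:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(64, 64), b(64, 64);
  a.setRandom();
  b.setRandom();

  // `a + b * 0.5f` builds a cwise expression tree; assigning it constructs
  // the matching evaluators above and runs them coefficient- or packet-wise.
  Eigen::Tensor<float, 2> c = a + b * 0.5f;

  // Comparing against a scalar and calling select() routes through the
  // TensorSelectOp evaluator (condition/then/else sub-evaluators).
  Eigen::Tensor<float, 2> d = (a > 0.0f).select(b, c);

  return d.size() == 64 * 64 ? 0 : 1;
}
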
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/**
+ * \class TensorExecutor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor executor class.
+ *
+ * This class is responsible for launching the evaluation of the expression on
+ * the specified computing device.
+ *
+ * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
+ *                      instructions)
+ * @tparam Tiling       can use block based tensor evaluation
+ *                      (see TensorBlock.h)
+ */
+namespace internal {
+
+/**
+ * Evaluating TensorBroadcastingOp via the coefficient or packet path is
+ * extremely expensive. If an expression has at least one broadcast op and
+ * supports block-based evaluation, we always prefer it, even for small
+ * tensors. For all other tileable ops, the block evaluation overhead for
+ * small tensors (those that fit into L1) is too large, and we fall back on
+ * vectorized evaluation.
+ */
+
+// TODO(ezhulenev): Add specializations for all other types of Tensor ops.
+
+template
+struct ExpressionHasTensorBroadcastingOp {
+  enum { value = false };
+};
+
+template
+struct ExpressionHasTensorBroadcastingOp > {
+  enum { value = ExpressionHasTensorBroadcastingOp::value };
+};
+
+template
+struct ExpressionHasTensorBroadcastingOp > {
+  enum { value = ExpressionHasTensorBroadcastingOp::value };
+};
+
+template
+struct ExpressionHasTensorBroadcastingOp > {
+  enum {
+    value = ExpressionHasTensorBroadcastingOp::value || ExpressionHasTensorBroadcastingOp::value
+  };
+};
+
+template
+struct ExpressionHasTensorBroadcastingOp > {
+  enum { value = true };
+};
+
+// -------------------------------------------------------------------------- //
+
+/**
+ * Default strategy: the expression is evaluated sequentially with a single CPU
+ * thread, without vectorization or block evaluation.
+ */
+template
+class TensorExecutor {
+ public:
+  typedef typename Expression::Index StorageIndex;
+
+  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
+  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
+  // violation. If this template is instantiated with a non-default device, it
+  // means that this header file was included without defining
+  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
+  static_assert(std::is_same::value,
+                "Default executor instantiated with non-default device. "
+                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
+                "EIGEN_USE_SYCL before including Eigen headers.");
+
+  static EIGEN_STRONG_INLINE void run(const Expression& expr, const Device& device = DefaultDevice()) {
+    TensorEvaluator evaluator(expr, device);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
+      for (StorageIndex i = 0; i < size; ++i) {
+        evaluator.evalScalar(i);
+      }
+    }
+    evaluator.cleanup();
+  }
+};
+
+/**
+ * Default async execution strategy is not implemented. Currently it's only
+ * available for ThreadPoolDevice (see definition below).
+ */
+template
+class TensorAsyncExecutor {};
+
+/**
+ * Process all the data with a single CPU thread, using vectorized instructions.
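 * For example (an illustration added here, derived from the loop bounds
 * below, not upstream documentation): with PacketSize = 8 and size = 100,
 * UnrolledSize = (100 / 32) * 32 = 96, so indices [0, 96) are evaluated with
 * evalPacket() inside the 4x-unrolled loop, the second packet loop finds
 * nothing left to do, and the tail [96, 100) is finished off with
 * evalScalar().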
+ */ +template +class TensorExecutor { + public: + typedef typename Expression::Index StorageIndex; + + static EIGEN_STRONG_INLINE void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); + const int PacketSize = + unpacket_traits::PacketReturnType>::size; + + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not + // unroll the loop at the expense of inlining. + const StorageIndex UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; + for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } + } + const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize; + for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) { + evaluator.evalPacket(i); + } + for (StorageIndex i = VectorizedSize; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + +/** + * Process all the data with a single cpu thread, using blocks of data. By + * sizing a block to fit L1 cache we get better cache performance. + */ +template +class TensorExecutor { + public: + typedef typename traits::Scalar Scalar; + typedef std::remove_const_t ScalarNoConst; + + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; + + static constexpr int NumDims = traits::NumDimensions; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { + typedef TensorBlockMapper TensorBlockMapper; + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + Evaluator evaluator(expr, device); + + // TODO(ezhulenev): Do not use tiling for small tensors? + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + + if (needs_assign) { + // Query expression tree for desired block size/shape. + const TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements(); + + const TensorBlockMapper block_mapper(typename TensorBlockDesc::Dimensions(evaluator.dimensions()), requirements); + + // Share scratch memory allocator between all blocks. + TensorBlockScratch scratch(device); + + const StorageIndex total_block_count = block_mapper.blockCount(); + for (StorageIndex i = 0; i < total_block_count; ++i) { + TensorBlockDesc desc = block_mapper.blockDescriptor(i); + evaluator.evalBlock(desc, scratch); + scratch.reset(); + } + } + evaluator.cleanup(); + } +}; + +/** + * Multicore strategy: the index space is partitioned and each partition is + * executed on a single core. + * + * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread + * pool, and will block the caller thread until all tasks are finished. + * + * (2) TensorAsyncExecutor is a non-blocking version, that will submit work to + * the ThreadPoolDevice managed thread pool, and will return immediately. + * It will call 'done' callback after all tasks are finished. 
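 * A usage sketch (an illustration added here, not upstream documentation;
 * the pool size, names, and shapes are ours): the non-blocking path is
 * reached through the device() overload that takes a done-callback, e.g.
 *
 *   Eigen::ThreadPool pool(4);
 *   Eigen::ThreadPoolDevice dev(&pool, 4);
 *   Eigen::Tensor<float, 1> in(1024), out(1024);
 *   in.setRandom();
 *   Eigen::Barrier done_barrier(1);
 *   out.device(dev, [&done_barrier]() { done_barrier.Notify(); }) = in * 2.0f;
 *   done_barrier.Wait();  // runAsync() returned immediately; block here instead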
+ */ +#ifdef EIGEN_USE_THREADS + +template +struct TensorExecutorTilingContext { + TensorExecutorTilingContext() = default; + TensorExecutorTilingContext(const TensorBlockMapper& b_mapper, const TensorOpCost& b_cost, size_t b_aligned_size) + : block_mapper(b_mapper), cost(b_cost), aligned_blocksize(b_aligned_size) {} + + TensorBlockMapper block_mapper; // navigate through blocks + TensorOpCost cost; // cost of computing a single block + size_t aligned_blocksize; // block size after memory alignment +}; + +// Computes a block evaluation parameters, and allocates temporary memory buffer +// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below. +template +TensorExecutorTilingContext GetTensorExecutorTilingContext(const Evaluator& evaluator) { + // Query expression tree for desired block size/shape. + TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements(); + + // Update target block size based on cost model. + double taskSize = TensorCostModel::taskSize(1, requirements.cost_per_coeff); + requirements.size = static_cast(1.0 / taskSize); + + TensorBlockMapper block_mapper(typename TensorBlockMapper::Dimensions(evaluator.dimensions()), requirements); + + size_t block_size = block_mapper.blockTotalSize(); + const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + const size_t aligned_blocksize = + align * numext::div_ceil(block_size * sizeof(typename Evaluator::Scalar), align); + + return {block_mapper, requirements.cost_per_coeff * block_size, aligned_blocksize}; +} + +template +struct EvalRange { + static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, const StorageIndex lastIdx) { + Evaluator evaluator = *evaluator_in; + eigen_assert(lastIdx >= firstIdx); + for (StorageIndex i = firstIdx; i < lastIdx; ++i) { + evaluator.evalScalar(i); + } + } + + static StorageIndex alignBlockSize(StorageIndex size) { return size; } +}; + +template +struct EvalRange { + static constexpr int PacketSize = unpacket_traits::size; + + static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, const StorageIndex lastIdx) { + Evaluator evaluator = *evaluator_in; + eigen_assert(lastIdx >= firstIdx); + StorageIndex i = firstIdx; + if (lastIdx - firstIdx >= PacketSize) { + eigen_assert(firstIdx % PacketSize == 0); + StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize; + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not + // unroll the loop at the expense of inlining. + for (; i <= last_chunk_offset; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } + } + last_chunk_offset = lastIdx - PacketSize; + for (; i <= last_chunk_offset; i += PacketSize) { + evaluator.evalPacket(i); + } + } + for (; i < lastIdx; ++i) { + evaluator.evalScalar(i); + } + } + + static StorageIndex alignBlockSize(StorageIndex size) { + // Align block size to packet size and account for unrolling in run above. + if (size >= 16 * PacketSize) { + return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); + } + // Aligning to 4 * PacketSize would increase block size by more than 25%. 
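    // (Worked example added here, not upstream: with PacketSize = 8 the cutoff
    // is 16 * 8 = 128; size = 200 takes the branch above and rounds up to a
    // multiple of 32, giving 224, while size = 100 falls through to the line
    // below and rounds up to a multiple of 8, giving 104.)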
+ return (size + PacketSize - 1) & ~(PacketSize - 1); + } +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index StorageIndex; + + static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { + typedef TensorEvaluator Evaluator; + typedef EvalRange EvalRange; + + Evaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); + device.parallelFor( + size, evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize, + [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) { EvalRange::run(&evaluator, firstIdx, lastIdx); }); + } + evaluator.cleanup(); + } +}; + +template +class TensorExecutor { + public: + typedef typename traits::Index IndexType; + typedef typename traits::Scalar Scalar; + typedef std::remove_const_t ScalarNoConst; + + static constexpr int NumDims = traits::NumDimensions; + + typedef TensorEvaluator Evaluator; + typedef TensorBlockMapper BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { + Evaluator evaluator(expr, device); + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const TilingContext tiling = + internal::GetTensorExecutorTilingContext(evaluator); + + auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx, IndexType lastBlockIdx) { + TensorBlockScratch scratch(device); + + for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) { + TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx); + evaluator.evalBlock(desc, scratch); + scratch.reset(); + } + }; + + // Evaluate small expressions directly as a single block. 
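      // (Note added here, not upstream: evaluating in place skips
      // device.parallelFor() entirely, avoiding task-scheduling overhead when
      // the whole expression fits in one block; in the multi-block case below,
      // tiling.cost lets the cost model decide how work is partitioned across
      // worker threads.)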
+ if (tiling.block_mapper.blockCount() == 1) { + TensorBlockScratch scratch(device); + TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions()); + evaluator.evalBlock(desc, scratch); + } else { + device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, eval_block); + } + } + evaluator.cleanup(); + } +}; + +template +class TensorAsyncExecutor { + public: + typedef typename Expression::Index StorageIndex; + typedef TensorEvaluator Evaluator; + + static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, const ThreadPoolDevice& device, DoneCallback done) { + TensorAsyncExecutorContext* const ctx = new TensorAsyncExecutorContext(expr, device, std::move(done)); + + const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void { + if (!need_assign) { + delete ctx; + return; + } + + typedef EvalRange EvalRange; + const StorageIndex size = array_prod(ctx->evaluator.dimensions()); + device.parallelForAsync( + size, ctx->evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize, + [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { EvalRange::run(&ctx->evaluator, firstIdx, lastIdx); }, + [ctx]() { delete ctx; }); + }; + + ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); + } + + private: + struct TensorAsyncExecutorContext { + TensorAsyncExecutorContext(const Expression& expr, const ThreadPoolDevice& thread_pool, DoneCallback done) + : evaluator(expr, thread_pool), on_done(std::move(done)) {} + + ~TensorAsyncExecutorContext() { + evaluator.cleanup(); + on_done(); + } + + Evaluator evaluator; + + private: + DoneCallback on_done; + }; +}; + +template +class TensorAsyncExecutor { + public: + typedef typename traits::Index IndexType; + typedef typename traits::Scalar Scalar; + typedef std::remove_const_t ScalarNoConst; + + static constexpr int NumDims = traits::NumDimensions; + + typedef TensorEvaluator Evaluator; + typedef TensorBlockMapper BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, const ThreadPoolDevice& device, DoneCallback done) { + TensorAsyncExecutorContext* const ctx = new TensorAsyncExecutorContext(expr, device, std::move(done)); + + const auto on_eval_subexprs = [ctx](bool need_assign) -> void { + if (!need_assign) { + delete ctx; + return; + } + + ctx->tiling = internal::GetTensorExecutorTilingContext(ctx->evaluator); + + auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) { + TensorBlockScratch scratch(ctx->device); + + for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) { + TensorBlockDesc desc = ctx->tiling.block_mapper.blockDescriptor(block_idx); + ctx->evaluator.evalBlock(desc, scratch); + scratch.reset(); + } + }; + + // Evaluate small expressions directly as a single block. 
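      // (Note added here, not upstream: unlike the synchronous executor above,
      // every exit path in the async variant must delete ctx;
      // ~TensorAsyncExecutorContext() runs evaluator.cleanup() and then fires
      // the user's done callback.)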
+ if (ctx->tiling.block_mapper.blockCount() == 1) { + TensorBlockScratch scratch(ctx->device); + TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions()); + ctx->evaluator.evalBlock(desc, scratch); + delete ctx; + } else { + ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(), ctx->tiling.cost, eval_block, + [ctx]() { delete ctx; }); + } + }; + + ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); + } + + private: + struct TensorAsyncExecutorContext { + TensorAsyncExecutorContext(const Expression& expr, const ThreadPoolDevice& thread_pool, DoneCallback done) + : device(thread_pool), evaluator(expr, thread_pool), on_done(std::move(done)) {} + + ~TensorAsyncExecutorContext() { + evaluator.cleanup(); + on_done(); + } + + const ThreadPoolDevice& device; + Evaluator evaluator; + TilingContext tiling; + + private: + DoneCallback on_done; + }; +}; + +#endif // EIGEN_USE_THREADS + +// GPU: the evaluation of the expression is offloaded to a GPU. +#if defined(EIGEN_USE_GPU) + +template +class TensorExecutor { + public: + typedef typename Expression::Index StorageIndex; + static void run(const Expression& expr, const GpuDevice& device); +}; + +#if defined(EIGEN_GPUCC) +// Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0. +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int sum_will_overflow(Index lhs, Index rhs) { + const Index highest = NumTraits::highest(); + const Index lowest = NumTraits::lowest(); + if (lhs > 0 && rhs > 0) { + return lhs > highest - rhs ? 1 : 0; + } else if (lhs < 0 && rhs < 0) { + return lhs < lowest - rhs ? -1 : 0; + } else { + return 0; + } +} + +// Returns lhs + rhs, saturating to the highest/lowest representable value on +// overflow/underflow respectively. +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) { + const Index highest = NumTraits::highest(); + const Index lowest = NumTraits::lowest(); + int overflow = sum_will_overflow(lhs, rhs); + return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs; +} + +// A functor that adds step_size to a given index, saturating to avoid +// overflow/underflow. If overflow/underflow is not possible, regular addition +// is used (for efficiency). +template +struct SafeStep { + // lastIdx is one past the end of the possible indexes. + // step_size is the value that will be added to the given index when the + // functor is called. + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SafeStep(Index lastIdx, Index step_size) + : can_overflow_(sum_will_overflow(lastIdx, step_size)), step_size_(step_size) {} + + // Adds step_size to index, saturating on overflow (if overflow is possible). + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index operator()(Index index) const { + return can_overflow_ ? 
saturate_add(index, step_size_) : index + step_size_;
+  }
+
+ private:
+  const bool can_overflow_;
+  const Index step_size_;
+};
+
+template
+struct EigenMetaKernelEval {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
+                                                        StorageIndex step_size) {
+    SafeStep safe_step(lastIdx, step_size);
+    for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) {
+      eval.evalScalar(i);
+    }
+  }
+};
+
+template
+struct EigenMetaKernelEval {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
+                                                        StorageIndex step_size) {
+    const StorageIndex PacketSize = unpacket_traits::size;
+    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
+    const StorageIndex vectorized_step_size = step_size * PacketSize;
+
+    SafeStep safe_vectorized_step(vectorized_size, vectorized_step_size);
+    // Use the vector path
+    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size; i = safe_vectorized_step(i)) {
+      eval.evalPacket(i);
+    }
+    SafeStep safe_step(lastIdx, step_size);
+    for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx; i = safe_step(i)) {
+      eval.evalScalar(i);
+    }
+  }
+};
+
+template
+__global__ void __launch_bounds__(1024) EigenMetaKernel(Evaluator eval, StorageIndex size) {
+  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const StorageIndex step_size = blockDim.x * gridDim.x;
+
+  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
+  EigenMetaKernelEval::run(eval, first_index, size, step_size);
+}
+
+/*static*/
+template
+EIGEN_STRONG_INLINE void TensorExecutor::run(const Expression& expr,
+                                             const GpuDevice& device) {
+  TensorEvaluator evaluator(expr, device);
+  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+  if (needs_assign) {
+    const int block_size = device.maxGpuThreadsPerBlock();
+    const int max_blocks = static_cast(
+        numext::mini(device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor(),
+                     NumTraits::highest()) /
+        block_size);
+    const StorageIndex size = array_prod(evaluator.dimensions());
+    // Create at least one block to ensure we won't crash when TensorFlow calls with tensors of size 0.
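    // (Worked example added here, not upstream: with size = 0,
    // div_ceil(0, block_size) = 0, so the outer maxi(..., 1) below still
    // launches one block whose threads immediately fall out of the
    // evaluation loop.)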
+ const int num_blocks = numext::maxi( + numext::mini(max_blocks, static_cast(numext::div_ceil(size, block_size))), 1); + + LAUNCH_GPU_KERNEL((EigenMetaKernel, StorageIndex>), num_blocks, block_size, + 0, device, evaluator, size); + } + evaluator.cleanup(); +} + +#endif // EIGEN_GPUCC +#endif // EIGEN_USE_GPU + +// SYCL Executor policy +#ifdef EIGEN_USE_SYCL + +template +struct ExecExprFunctorKernel { + typedef typename Evaluator::Index Index; + Evaluator evaluator; + const Index range; + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(const Scratch, Evaluator evaluator_, const Index range_) + : evaluator(evaluator_), range(range_) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { compute(itemID); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t compute(const cl::sycl::nd_item<1>& itemID) const { + Index gId = static_cast(itemID.get_global_linear_id()); + Index total_threads = itemID.get_global_range(0); + + for (Index i = gId; i < range; i += total_threads) { + evaluator.evalScalar(i); + } + } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t compute(const cl::sycl::nd_item<1>& itemID) const { + const Index vectorizedRange = (range / Evaluator::PacketSize) * Evaluator::PacketSize; + Index gId = static_cast(itemID.get_global_linear_id()); + const Index step = Evaluator::PacketSize * itemID.get_global_range(0); + const Index start = Evaluator::PacketSize * gId; + for (Index i = start; i < vectorizedRange; i += step) { + evaluator.evalPacket(i); + } + gId += vectorizedRange; + for (Index i = gId; i < range; i += itemID.get_global_range(0)) { + evaluator.evalScalar(i); + } + } +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + static EIGEN_STRONG_INLINE void run(const Expression& expr, const Eigen::SyclDevice& dev) { + typedef Eigen::TensorEvaluator Evaluator; + Evaluator evaluator(expr, dev); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + Index range, GRange, tileSize; + Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); + total_size = (total_size == 0) ? 1 : total_size; + const int PacketSize = Eigen::PacketType::size; + Index vectorizable_threads = static_cast(total_size / PacketSize); + dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange); + range = total_size; + + dev.template nullary_kernel_launcher >( + evaluator, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1), + range) + .wait(); + } + evaluator.cleanup(); + } +}; + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h new file mode 100644 index 00000000000..5f4c72367c6 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -0,0 +1,329 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
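
A compilable sketch, not part of the upstream patch, of how the executors above get selected; it assumes EIGEN_USE_THREADS is defined before the include, and the pool size and tensor shapes are illustrative:

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice dev(&pool, /*num_cores=*/4);

  Eigen::Tensor<float, 2> a(512, 512), b(512, 512), c(512, 512);
  a.setRandom();
  b.setRandom();

  // A plain assignment would run on DefaultDevice; routing through
  // .device(dev) instead instantiates the ThreadPoolDevice executor, which
  // picks vectorized and/or tiled evaluation from the evaluator's traits.
  c.device(dev) = a * b + a.constant(1.0f);
  return 0;
}
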
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorExpr + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor expression classes. + * + * The TensorCwiseNullaryOp class applies a nullary operators to an expression. + * This is typically used to generate constants. + * + * The TensorCwiseUnaryOp class represents an expression where a unary operator + * (e.g. cwiseSqrt) is applied to an expression. + * + * The TensorCwiseBinaryOp class represents an expression where a binary + * operator (e.g. addition) is applied to a lhs and a rhs expression. + * + */ +namespace internal { +template +struct traits > : traits { + typedef traits XprTraits; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Nested XprTypeNested; + typedef std::remove_reference_t XprTypeNested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; + enum { Flags = 0 }; +}; + +} // end namespace internal + +template +class TensorCwiseNullaryOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef TensorCwiseNullaryOp Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& nestedExpression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC const NullaryOp& functor() const { return m_functor; } + + protected: + typename XprType::Nested m_xpr; + const NullaryOp m_functor; +}; + +namespace internal { +template +struct traits > : traits { + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename result_of::type Scalar; + typedef traits XprTraits; + typedef typename XprType::Nested XprTypeNested; + typedef std::remove_reference_t XprTypeNested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename TypeConversion::type PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorCwiseUnaryOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorCwiseUnaryOp type; +}; + +} // end namespace internal + +template +class TensorCwiseUnaryOp : public TensorBase, ReadOnlyAccessors> { + public: + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. 
+ typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC const UnaryOp& functor() const { return m_functor; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC const internal::remove_all_t& nestedExpression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const UnaryOp m_functor; +}; + +namespace internal { +template +struct traits > { + // Type promotion to handle the case where the types of the lhs and the rhs + // are different. + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. + typedef typename result_of::type Scalar; + typedef traits XprTraits; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef + typename promote_index_type::Index, typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename TypeConversion::val, + typename traits::PointerType, + typename traits::PointerType> >::type PointerType; + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorCwiseBinaryOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorCwiseBinaryOp type; +}; + +} // end namespace internal + +template +class TensorCwiseBinaryOp + : public TensorBase, ReadOnlyAccessors> { + public: + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, + const BinaryOp& func = BinaryOp()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} + + EIGEN_DEVICE_FUNC const BinaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC const internal::remove_all_t& lhsExpression() const { + return m_lhs_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& rhsExpression() const { + return m_rhs_xpr; + } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const BinaryOp m_functor; +}; + +namespace internal { +template +struct traits > { + // Type promotion to handle the case where the types of the args are different. 
+ typedef typename result_of::type Scalar; + typedef traits XprTraits; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename Arg1XprType::Nested Arg1Nested; + typedef typename Arg2XprType::Nested Arg2Nested; + typedef typename Arg3XprType::Nested Arg3Nested; + typedef std::remove_reference_t Arg1Nested_; + typedef std::remove_reference_t Arg2Nested_; + typedef std::remove_reference_t Arg3Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename TypeConversion::val, + typename traits::PointerType, + typename traits::PointerType> >::type PointerType; + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorCwiseTernaryOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorCwiseTernaryOp type; +}; + +} // end namespace internal + +template +class TensorCwiseTernaryOp + : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, + const Arg3XprType& arg3, + const TernaryOp& func = TernaryOp()) + : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {} + + EIGEN_DEVICE_FUNC const TernaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC const internal::remove_all_t& arg1Expression() const { + return m_arg1_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& arg2Expression() const { + return m_arg2_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& arg3Expression() const { + return m_arg3_xpr; + } + + protected: + typename Arg1XprType::Nested m_arg1_xpr; + typename Arg2XprType::Nested m_arg2_xpr; + typename Arg3XprType::Nested m_arg3_xpr; + const TernaryOp m_functor; +}; + +namespace internal { +template +struct traits > : traits { + typedef typename traits::Scalar Scalar; + typedef traits XprTraits; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef + typename promote_index_type::Index, typename traits::Index>::type Index; + typedef typename IfXprType::Nested IfNested; + typedef typename ThenXprType::Nested ThenNested; + typedef typename ElseXprType::Nested ElseNested; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef std::conditional_t::val, + typename traits::PointerType, typename traits::PointerType> + PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorSelectOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorSelectOp type; +}; + +} // end namespace internal + +template +class TensorSelectOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename 
Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC TensorSelectOp(const IfXprType& a_condition, const ThenXprType& a_then, const ElseXprType& a_else) + : m_condition(a_condition), m_then(a_then), m_else(a_else) {} + + EIGEN_DEVICE_FUNC const IfXprType& ifExpression() const { return m_condition; } + + EIGEN_DEVICE_FUNC const ThenXprType& thenExpression() const { return m_then; } + + EIGEN_DEVICE_FUNC const ElseXprType& elseExpression() const { return m_else; } + + protected: + typename IfXprType::Nested m_condition; + typename ThenXprType::Nested m_then; + typename ElseXprType::Nested m_else; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h new file mode 100644 index 00000000000..f5d1b2da357 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -0,0 +1,674 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Jianwei Cui +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H +#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorFFT + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor FFT class. + * + * TODO: + * Vectorize the Cooley Tukey and the Bluestein algorithm + * Add support for multithreaded evaluation + * Improve the performance on GPU + */ + +template +struct MakeComplex { + template + EIGEN_DEVICE_FUNC T operator()(const T& val) const { + return val; + } +}; + +template <> +struct MakeComplex { + template + EIGEN_DEVICE_FUNC std::complex operator()(const T& val) const { + return std::complex(val, 0); + } +}; + +template <> +struct MakeComplex { + template + EIGEN_DEVICE_FUNC std::complex operator()(const std::complex& val) const { + return val; + } +}; + +template +struct PartOf { + template + T operator()(const T& val) const { + return val; + } +}; + +template <> +struct PartOf { + template + T operator()(const std::complex& val) const { + return val.real(); + } +}; + +template <> +struct PartOf { + template + T operator()(const std::complex& val) const { + return val.imag(); + } +}; + +namespace internal { +template +struct traits > : public traits { + typedef traits XprTraits; + typedef typename NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename XprTraits::Scalar InputScalar; + typedef std::conditional_t + OutputScalar; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename traits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorFFTOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorFFTOp type; +}; + +} // end namespace internal + +template +class TensorFFTOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename 
Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef std::conditional_t + OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft) : m_xpr(expr), m_fft(fft) {} + + EIGEN_DEVICE_FUNC const FFT& fft() const { return m_fft; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const FFT m_fft; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorFFTOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename TensorEvaluator::Dimensions InputDimensions; + typedef internal::traits XprTraits; + typedef typename XprTraits::Scalar InputScalar; + typedef std::conditional_t + OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = internal::unpacket_traits::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = true, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + } + } + m_size = m_dimensions.TotalSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (data) { + evalToBuf(data); + return false; + } else { + m_data = (EvaluatorPointerType)m_device.get( + (CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size))); + evalToBuf(m_data); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + if (m_data) { + m_device.deallocate(m_data); + m_data = NULL; + } + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { return m_data[index]; } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + return 
internal::ploadt(m_data + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) { + const bool write_to_out = internal::is_same::value; + ComplexScalar* buf = + write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); + + for (Index i = 0; i < m_size; ++i) { + buf[i] = MakeComplex::value>()(m_impl.coeff(i)); + } + + for (size_t i = 0; i < m_fft.size(); ++i) { + Index dim = m_fft[i]; + eigen_assert(dim >= 0 && dim < NumDims); + Index line_len = m_dimensions[dim]; + eigen_assert(line_len >= 1); + ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len); + const bool is_power_of_two = isPowerOfTwo(line_len); + const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len); + const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite); + + ComplexScalar* a = + is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* b = + is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* pos_j_base_powered = + is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1)); + if (!is_power_of_two) { + // Compute twiddle factors + // t_n = exp(sqrt(-1) * pi * n^2 / line_len) + // for n = 0, 1,..., line_len-1. + // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 + + // The recurrence is correct in exact arithmetic, but causes + // numerical issues for large transforms, especially in + // single-precision floating point. + // + // pos_j_base_powered[0] = ComplexScalar(1, 0); + // if (line_len > 1) { + // const ComplexScalar pos_j_base = ComplexScalar( + // numext::cos(M_PI / line_len), numext::sin(M_PI / line_len)); + // pos_j_base_powered[1] = pos_j_base; + // if (line_len > 2) { + // const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; + // for (int i = 2; i < line_len + 1; ++i) { + // pos_j_base_powered[i] = pos_j_base_powered[i - 1] * + // pos_j_base_powered[i - 1] / + // pos_j_base_powered[i - 2] * + // pos_j_base_sq; + // } + // } + // } + // TODO(rmlarsen): Find a way to use Eigen's vectorized sin + // and cosine functions here. 
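For reference, the direct evaluation in the loop below can be written as a standalone helper. This is a minimal sketch under assumed names: it computes the Bluestein twiddle factors t_n = exp(i * pi * n^2 / len) element by element, which costs a sin/cos per entry but, unlike the recurrence, accumulates no rounding error.

#include <cmath>
#include <complex>
#include <cstddef>
#include <vector>

// Direct twiddle-factor evaluation for Bluestein's algorithm. Each entry is
// computed from scratch, so errors do not compound across iterations.
std::vector<std::complex<double>> bluestein_twiddles(int len) {
  const double kPi = 3.14159265358979323846;
  std::vector<std::complex<double>> t(static_cast<std::size_t>(len) + 1);
  for (int n = 0; n <= len; ++n) {
    const double arg = (kPi * n) * n / len;
    t[n] = std::complex<double>(std::cos(arg), std::sin(arg));
  }
  return t;
}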
+ for (int j = 0; j < line_len + 1; ++j) { + double arg = ((EIGEN_PI * j) * j) / line_len; + std::complex tmp(numext::cos(arg), numext::sin(arg)); + pos_j_base_powered[j] = static_cast(tmp); + } + } + + for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) { + const Index base_offset = getBaseOffsetFromIndex(partial_index, dim); + + // get data into line_buf + const Index stride = m_strides[dim]; + if (stride == 1) { + m_device.memcpy(line_buf, &buf[base_offset], line_len * sizeof(ComplexScalar)); + } else { + Index offset = base_offset; + for (int j = 0; j < line_len; ++j, offset += stride) { + line_buf[j] = buf[offset]; + } + } + + // process the line + if (is_power_of_two) { + processDataLineCooleyTukey(line_buf, line_len, log_len); + } else { + processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered); + } + + // write back + if (FFTDir == FFT_FORWARD && stride == 1) { + m_device.memcpy(&buf[base_offset], line_buf, line_len * sizeof(ComplexScalar)); + } else { + Index offset = base_offset; + const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); + for (int j = 0; j < line_len; ++j, offset += stride) { + buf[offset] = (FFTDir == FFT_FORWARD) ? line_buf[j] : line_buf[j] * div_factor; + } + } + } + m_device.deallocate(line_buf); + if (!is_power_of_two) { + m_device.deallocate(a); + m_device.deallocate(b); + m_device.deallocate(pos_j_base_powered); + } + } + + if (!write_to_out) { + for (Index i = 0; i < m_size; ++i) { + data[i] = PartOf()(buf[i]); + } + m_device.deallocate(buf); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) { + eigen_assert(x > 0); + return !(x & (x - 1)); + } + + // The composite number for padding, used in Bluestein's FFT algorithm + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) { + Index i = 2; + while (i < 2 * n - 1) i *= 2; + return i; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) { + Index log2m = 0; + while (m >>= 1) log2m++; + return log2m; + } + + // Call Cooley Tukey algorithm directly, data length must be power of 2 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len, + Index log_len) { + eigen_assert(isPowerOfTwo(line_len)); + scramble_FFT(line_buf, line_len); + compute_1D_Butterfly(line_buf, line_len, log_len); + } + + // Call Bluestein's FFT algorithm, m is a good composite number greater than (2 * n - 1), used as the padding length + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len, + Index good_composite, Index log_len, + ComplexScalar* a, ComplexScalar* b, + const ComplexScalar* pos_j_base_powered) { + Index n = line_len; + Index m = good_composite; + ComplexScalar* data = line_buf; + + for (Index i = 0; i < n; ++i) { + if (FFTDir == FFT_FORWARD) { + a[i] = data[i] * numext::conj(pos_j_base_powered[i]); + } else { + a[i] = data[i] * pos_j_base_powered[i]; + } + } + for (Index i = n; i < m; ++i) { + a[i] = ComplexScalar(0, 0); + } + + for (Index i = 0; i < n; ++i) { + if (FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[i]; + } else { + b[i] = numext::conj(pos_j_base_powered[i]); + } + } + for (Index i = n; i < m - n; ++i) { + b[i] = ComplexScalar(0, 0); + } + for (Index i = m - n; i < m; ++i) { + if (FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[m - i]; + } else { + b[i] = numext::conj(pos_j_base_powered[m - i]); + } + } + + scramble_FFT(a, m); + 
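The sequence of calls that follows is the convolution-theorem step at the heart of Bluestein's method: transform both zero-padded sequences, multiply pointwise, transform back, then undo the inverse transform's 1/m scaling. A compact sketch, where fft and ifft are assumed stand-ins for the scramble_FFT plus compute_1D_Butterfly pair used here:

#include <complex>
#include <cstddef>
#include <vector>

void fft(std::vector<std::complex<double>>& v);   // assumed in-place FFT
void ifft(std::vector<std::complex<double>>& v);  // assumed unscaled inverse

// Circular convolution of a and b (both already padded to length m),
// written into a.
void circular_convolve(std::vector<std::complex<double>>& a,
                       std::vector<std::complex<double>>& b) {
  fft(a);
  fft(b);
  for (std::size_t i = 0; i < a.size(); ++i) a[i] *= b[i];  // pointwise
  ifft(a);
  for (auto& v : a) v /= static_cast<double>(a.size());     // 1/m scaling
}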
compute_1D_Butterfly(a, m, log_len); + + scramble_FFT(b, m); + compute_1D_Butterfly(b, m, log_len); + + for (Index i = 0; i < m; ++i) { + a[i] *= b[i]; + } + + scramble_FFT(a, m); + compute_1D_Butterfly(a, m, log_len); + + // Do the scaling after ifft + for (Index i = 0; i < m; ++i) { + a[i] /= m; + } + + for (Index i = 0; i < n; ++i) { + if (FFTDir == FFT_FORWARD) { + data[i] = a[i] * numext::conj(pos_j_base_powered[i]); + } else { + data[i] = a[i] * pos_j_base_powered[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) { + eigen_assert(isPowerOfTwo(n)); + Index j = 1; + for (Index i = 1; i < n; ++i) { + if (j > i) { + std::swap(data[j - 1], data[i - 1]); + } + Index m = n >> 1; + while (m >= 2 && j > m) { + j -= m; + m >>= 1; + } + j += m; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_2(ComplexScalar* data) { + ComplexScalar tmp = data[1]; + data[1] = data[0] - data[1]; + data[0] += tmp; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_4(ComplexScalar* data) { + ComplexScalar tmp[4]; + tmp[0] = data[0] + data[1]; + tmp[1] = data[0] - data[1]; + tmp[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]); + } else { + tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]); + } + data[0] = tmp[0] + tmp[2]; + data[1] = tmp[1] + tmp[3]; + data[2] = tmp[0] - tmp[2]; + data[3] = tmp[1] - tmp[3]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_8(ComplexScalar* data) { + ComplexScalar tmp_1[8]; + ComplexScalar tmp_2[8]; + + tmp_1[0] = data[0] + data[1]; + tmp_1[1] = data[0] - data[1]; + tmp_1[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1); + } else { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1); + } + tmp_1[4] = data[4] + data[5]; + tmp_1[5] = data[4] - data[5]; + tmp_1[6] = data[6] + data[7]; + if (Dir == FFT_FORWARD) { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1); + } else { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1); + } + tmp_2[0] = tmp_1[0] + tmp_1[2]; + tmp_2[1] = tmp_1[1] + tmp_1[3]; + tmp_2[2] = tmp_1[0] - tmp_1[2]; + tmp_2[3] = tmp_1[1] - tmp_1[3]; + tmp_2[4] = tmp_1[4] + tmp_1[6]; +// SQRT2DIV2 = sqrt(2)/2 +#define SQRT2DIV2 0.7071067811865476 + if (Dir == FFT_FORWARD) { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2); + } else { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2); + } + data[0] = tmp_2[0] + tmp_2[4]; + data[1] = tmp_2[1] + tmp_2[5]; + data[2] = tmp_2[2] + tmp_2[6]; + data[3] = tmp_2[3] + tmp_2[7]; + data[4] = tmp_2[0] - tmp_2[4]; + data[5] = tmp_2[1] - tmp_2[5]; + data[6] = tmp_2[2] - tmp_2[6]; + data[7] = tmp_2[3] - tmp_2[7]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge(ComplexScalar* data, Index n, Index n_power_of_2) { + // Original code: + // RealScalar wtemp = std::sin(M_PI/n); + // RealScalar wpi = -std::sin(2 * M_PI/n); + const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2]; + const RealScalar wpi = + (Dir == FFT_FORWARD) ? 
m_minus_sin_2_PI_div_n_LUT[n_power_of_2] : -m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; + + const ComplexScalar wp(wtemp, wpi); + const ComplexScalar wp_one = wp + ComplexScalar(1, 0); + const ComplexScalar wp_one_2 = wp_one * wp_one; + const ComplexScalar wp_one_3 = wp_one_2 * wp_one; + const ComplexScalar wp_one_4 = wp_one_3 * wp_one; + const Index n2 = n / 2; + ComplexScalar w(1.0, 0.0); + for (Index i = 0; i < n2; i += 4) { + ComplexScalar temp0(data[i + n2] * w); + ComplexScalar temp1(data[i + 1 + n2] * w * wp_one); + ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2); + ComplexScalar temp3(data[i + 3 + n2] * w * wp_one_3); + w = w * wp_one_4; + + data[i + n2] = data[i] - temp0; + data[i] += temp0; + + data[i + 1 + n2] = data[i + 1] - temp1; + data[i + 1] += temp1; + + data[i + 2 + n2] = data[i + 2] - temp2; + data[i + 2] += temp2; + + data[i + 3 + n2] = data[i + 3] - temp3; + data[i + 3] += temp3; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly(ComplexScalar* data, Index n, Index n_power_of_2) { + eigen_assert(isPowerOfTwo(n)); + if (n > 8) { + compute_1D_Butterfly(data, n / 2, n_power_of_2 - 1); + compute_1D_Butterfly(data + n / 2, n / 2, n_power_of_2 - 1); + butterfly_1D_merge(data, n, n_power_of_2); + } else if (n == 8) { + butterfly_8(data); + } else if (n == 4) { + butterfly_4(data); + } else if (n == 2) { + butterfly_2(data); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const { + Index result = 0; + + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > omitted_dim; --i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } else { + for (Index i = 0; i < omitted_dim; ++i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } + // Value of index_coords[omitted_dim] is not determined to this step + return result; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, Index omitted_dim, Index offset) const { + Index result = base + offset * m_strides[omitted_dim]; + return result; + } + + protected: + Index m_size; + const FFT EIGEN_DEVICE_REF m_fft; + Dimensions m_dimensions; + array m_strides; + TensorEvaluator m_impl; + EvaluatorPointerType m_data; + const Device EIGEN_DEVICE_REF m_device; + + // This will support a maximum FFT size of 2^32 for each dimension + // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; + const RealScalar m_sin_PI_div_n_LUT[32] = {RealScalar(0.0), + RealScalar(-2), + RealScalar(-0.999999999999999), + RealScalar(-0.292893218813453), + RealScalar(-0.0761204674887130), + RealScalar(-0.0192147195967696), + RealScalar(-0.00481527332780311), + RealScalar(-0.00120454379482761), + RealScalar(-3.01181303795779e-04), + RealScalar(-7.52981608554592e-05), + RealScalar(-1.88247173988574e-05), + RealScalar(-4.70619042382852e-06), + RealScalar(-1.17654829809007e-06), + RealScalar(-2.94137117780840e-07), + RealScalar(-7.35342821488550e-08), + RealScalar(-1.83835707061916e-08), + RealScalar(-4.59589268710903e-09), + RealScalar(-1.14897317243732e-09), + RealScalar(-2.87243293150586e-10), + RealScalar(-7.18108232902250e-11), + RealScalar(-1.79527058227174e-11), + 
RealScalar(-4.48817645568941e-12), + RealScalar(-1.12204411392298e-12), + RealScalar(-2.80511028480785e-13), + RealScalar(-7.01277571201985e-14), + RealScalar(-1.75319392800498e-14), + RealScalar(-4.38298482001247e-15), + RealScalar(-1.09574620500312e-15), + RealScalar(-2.73936551250781e-16), + RealScalar(-6.84841378126949e-17), + RealScalar(-1.71210344531737e-17), + RealScalar(-4.28025861329343e-18)}; + + // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i)); + const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = {RealScalar(0.0), + RealScalar(0.0), + RealScalar(-1.00000000000000e+00), + RealScalar(-7.07106781186547e-01), + RealScalar(-3.82683432365090e-01), + RealScalar(-1.95090322016128e-01), + RealScalar(-9.80171403295606e-02), + RealScalar(-4.90676743274180e-02), + RealScalar(-2.45412285229123e-02), + RealScalar(-1.22715382857199e-02), + RealScalar(-6.13588464915448e-03), + RealScalar(-3.06795676296598e-03), + RealScalar(-1.53398018628477e-03), + RealScalar(-7.66990318742704e-04), + RealScalar(-3.83495187571396e-04), + RealScalar(-1.91747597310703e-04), + RealScalar(-9.58737990959773e-05), + RealScalar(-4.79368996030669e-05), + RealScalar(-2.39684498084182e-05), + RealScalar(-1.19842249050697e-05), + RealScalar(-5.99211245264243e-06), + RealScalar(-2.99605622633466e-06), + RealScalar(-1.49802811316901e-06), + RealScalar(-7.49014056584716e-07), + RealScalar(-3.74507028292384e-07), + RealScalar(-1.87253514146195e-07), + RealScalar(-9.36267570730981e-08), + RealScalar(-4.68133785365491e-08), + RealScalar(-2.34066892682746e-08), + RealScalar(-1.17033446341373e-08), + RealScalar(-5.85167231706864e-09), + RealScalar(-2.92583615853432e-09)}; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h new file mode 100644 index 00000000000..7f229cb063d --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -0,0 +1,226 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H +#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorFixedSize + * \ingroup CXX11_Tensor_Module + * + * \brief The fixed sized version of the tensor class. + * + * The fixed sized equivalent of + * Eigen::Tensor t(3, 5, 7); + * is + * Eigen::TensorFixedSize> t; + */ + +template +class TensorFixedSize : public TensorBase > { + public: + typedef TensorFixedSize Self; + typedef TensorBase > Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef Scalar_ Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + static constexpr int Options = Options_; + static constexpr int Layout = Options_ & RowMajor ? 
RowMajor : ColMajor; + + enum { + IsAligned = bool(EIGEN_MAX_ALIGN_BYTES > 0), + PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = true, + RawAccess = true + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + typedef Dimensions_ Dimensions; + static constexpr std::size_t NumIndices = Dimensions::count; + + protected: + TensorStorage m_storage; + + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar* data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array{{firstIndex, otherIndices...}}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array{{firstIndex, otherIndices...}}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array{{firstIndex, otherIndices...}}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const { + eigen_assert(checkIndexRange(indices)); + return coeff(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array{{firstIndex, otherIndices...}}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) { + eigen_assert(checkIndexRange(indices)); + return coeffRef(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeffRef(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize() : m_storage() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(const Self& other) : Base(other), m_storage(other.m_storage) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other) : m_storage(other.m_storage) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase& other) { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase& other) { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. 
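A minimal usage sketch, assuming the usual unsupported Tensor include path: all dimensions live in the type, so storage is inline (no heap allocation) and passing the wrong number of indices to operator() fails at compile time.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Rank-3 tensor whose 3x5x7 shape is a compile-time constant.
  Eigen::TensorFixedSize<float, Eigen::Sizes<3, 5, 7>> t;
  t.setZero();
  t(1, 2, 3) = 4.0f;  // exactly three indices; the rank is checked statically
  // Reductions yield rank-0 expressions; evaluate one and read the scalar.
  Eigen::Tensor<float, 0> s = t.sum();
  return s() == 4.0f ? 0 : 1;
}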
+ EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(TensorFixedSize) + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool checkIndexRange(const array& /*indices*/) const { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::lesser_op; + using internal::logical_and_op; + + return true; + // check whether the indices are all >= 0 + /* array_apply_and_reduce(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce(indices, m_storage.dimensions());*/ + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const { + if (Options & RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h new file mode 100644 index 00000000000..d0fbfb38a89 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -0,0 +1,233 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template +struct traits> { + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
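This op is what the .eval() method on tensor expressions expands to. A short sketch of the typical use, forcing a subexpression into a buffer before an op that reads its operand repeatedly:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(8, 8);
  a.setRandom();
  // Without eval(), every coefficient access made by the broadcast would
  // recompute the exponential; the forced-eval node materializes exp(a)
  // once and the broadcast then reads the cached buffer.
  Eigen::array<Eigen::Index, 2> bcast{{2, 2}};
  Eigen::Tensor<float, 2> c = a.exp().eval().broadcast(bcast);
  return c.dimension(0) == 16 ? 0 : 1;
}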
+ typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; + + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorForcedEvalOp& type; +}; + +template +struct nested, 1, typename eval>::type> { + typedef TensorForcedEvalOp type; +}; + +} // end namespace internal + +template +class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + +namespace internal { +template +struct non_integral_type_placement_new { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index numValues, StorageType m_buffer) { + // Initialize non-trivially constructible types. + if (!internal::is_arithmetic::value) { + for (Index i = 0; i < numValues; ++i) new (m_buffer + i) CoeffReturnType(); + } + } +}; + +// SYCL does not support non-integral types +// having new (m_buffer + i) CoeffReturnType() causes the following compiler error for SYCL Devices +// no matching function for call to 'operator new' +template +struct non_integral_type_placement_new { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index, StorageType) {} +}; +} // end namespace internal + +template +class DeviceTempPointerHolder { + public: + DeviceTempPointerHolder(const Device& device, size_t size) + : device_(device), size_(size), ptr_(device.allocate_temp(size)) {} + + ~DeviceTempPointerHolder() { + device_.deallocate_temp(ptr_); + size_ = 0; + ptr_ = nullptr; + } + + void* ptr() { return ptr_; } + + private: + Device device_; + size_t size_; + void* ptr_; +}; + +template +struct TensorEvaluator, Device> { + typedef const internal::remove_all_t ArgType; + typedef TensorForcedEvalOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = true, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic::value, + PreferBlockAccess = false, + RawAccess = true + }; + + static constexpr int Layout = TensorEvaluator::Layout; + static constexpr int NumDims = internal::traits::NumDimensions; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef 
internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), + m_op(op.expression()), + m_device(device), + m_buffer_holder(nullptr), + m_buffer(nullptr) {} + + ~TensorEvaluator() { cleanup(); } + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + const Index numValues = internal::array_prod(m_impl.dimensions()); + m_buffer_holder = std::make_shared>(m_device, numValues * sizeof(CoeffReturnType)); + m_buffer = static_cast(m_buffer_holder->ptr()); + + internal::non_integral_type_placement_new()(numValues, m_buffer); + + typedef TensorEvalToOp> EvalTo; + EvalTo evalToTmp(m_device.get(m_buffer), m_op); + + internal::TensorExecutor, + /*Vectorizable=*/internal::IsVectorizable::value, + /*Tiling=*/internal::IsTileable::value>::run(evalToTmp, m_device); + + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + const Index numValues = internal::array_prod(m_impl.dimensions()); + m_buffer_holder = std::make_shared>(m_device, numValues * sizeof(CoeffReturnType)); + m_buffer = static_cast(m_buffer_holder->ptr()); + + typedef TensorEvalToOp> EvalTo; + EvalTo evalToTmp(m_device.get(m_buffer), m_op); + + auto on_done = std::bind([](EvalSubExprsCallback done_) { done_(true); }, std::move(done)); + internal::TensorAsyncExecutor< + const EvalTo, std::remove_const_t, decltype(on_done), + /*Vectorizable=*/internal::IsVectorizable::value, + /*Tiling=*/internal::IsTileable::value>::runAsync(evalToTmp, m_device, + std::move(on_done)); + } +#endif + + EIGEN_STRONG_INLINE void cleanup() { + m_buffer_holder = nullptr; + m_buffer = nullptr; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_buffer[index]; } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt(m_buffer + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + eigen_assert(m_buffer != nullptr); + return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EvaluatorPointerType data() const { return m_buffer; } + + private: + TensorEvaluator m_impl; + const ArgType m_op; + const Device EIGEN_DEVICE_REF m_device; + std::shared_ptr> m_buffer_holder; + EvaluatorPointerType m_buffer; // Cached copy of the value stored in m_buffer_holder. 
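The buffer management above is a small RAII pattern: a holder owns the device allocation and evaluator copies share it through shared_ptr, so the memory is returned exactly once, when the last copy goes away. A device-agnostic sketch; Device here is a hypothetical stand-in with Eigen-style temp-allocation hooks, not Eigen's real device classes:

#include <cstdlib>
#include <memory>

struct Device {
  void* allocate_temp(std::size_t bytes) const { return std::malloc(bytes); }
  void deallocate_temp(void* p) const { std::free(p); }
};

class TempBufferHolder {
 public:
  TempBufferHolder(const Device& device, std::size_t bytes)
      : device_(device), ptr_(device.allocate_temp(bytes)) {}
  ~TempBufferHolder() { device_.deallocate_temp(ptr_); }
  TempBufferHolder(const TempBufferHolder&) = delete;  // ownership is unique...
  TempBufferHolder& operator=(const TempBufferHolder&) = delete;
  void* ptr() const { return ptr_; }

 private:
  Device device_;
  void* ptr_;
};

int main() {
  Device d;
  // ...but the holder itself is shared, exactly as in the evaluator above.
  auto held = std::make_shared<TempBufferHolder>(d, 1024);
  auto copy = held;
  held.reset();  // the buffer stays alive through `copy`
  return copy->ptr() ? 0 : 1;
}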
+}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h new file mode 100644 index 00000000000..49c20a4e285 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -0,0 +1,215 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +// MakePointer class is used as a container of the address space of the pointer +// on the host and on the device. From the host side it generates the T* pointer +// and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to +// T* m_data on the host. It is always called on the device. +// Specialisation of MakePointer class for creating the sycl buffer with +// map_allocator. +template +struct MakePointer { + typedef T* Type; + typedef const T* ConstType; +}; + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* constCast(const T* data) { + return const_cast(data); +} + +// The StorageMemory class is a container of the device specific pointer +// used for referring to a Pointer on TensorEvaluator class. While the TensorExpression +// is a device-agnostic type and need MakePointer class for type conversion, +// the TensorEvaluator class can be specialized for a device, hence it is possible +// to construct different types of temporary storage memory in TensorEvaluator +// for different devices by specializing the following StorageMemory class. 
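The device hook described in the comment above is plain template specialization. A sketch with hypothetical names of how a device substitutes its own pointer representation:

// Generic case: evaluator storage is a raw host pointer.
template <typename T, typename Device>
struct StorageFor {
  typedef T* Type;
};

struct FancyDevice {};  // hypothetical device tag

// Specialization: storage becomes a device-specific handle instead.
template <typename T>
struct StorageFor<T, FancyDevice> {
  struct Handle {
    T* raw;  // e.g. wrapped buffer/accessor state would live here
  };
  typedef Handle Type;
};

// An evaluator parameterized on FancyDevice would then declare its buffer
// as StorageFor<float, FancyDevice>::Type rather than float*.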
+template +struct StorageMemory : MakePointer {}; + +namespace internal { +template +struct Pointer_type_promotion { + static const bool val = false; +}; +template +struct Pointer_type_promotion { + static const bool val = true; +}; +template +struct TypeConversion { + typedef A* type; +}; +} // namespace internal + +template class MakePointer_ = MakePointer> +class TensorMap; +template +class Tensor; +template +class TensorFixedSize; +template +class TensorRef; +template +class TensorBase; + +template +class TensorCwiseNullaryOp; +template +class TensorCwiseUnaryOp; +template +class TensorCwiseBinaryOp; +template +class TensorCwiseTernaryOp; +template +class TensorSelectOp; +template class MakePointer_ = MakePointer> +class TensorReductionOp; +template +class TensorIndexPairOp; +template +class TensorPairReducerOp; +template +class TensorConcatenationOp; +template +class TensorContractionOp; +template +class TensorConversionOp; +template +class TensorConvolutionOp; +template +class TensorFFTOp; +template +class TensorPatchOp; +template +class TensorImagePatchOp; +template +class TensorVolumePatchOp; +template +class TensorBroadcastingOp; +template +class TensorChippingOp; +template +class TensorReshapingOp; +template +class TensorLayoutSwapOp; +template +class TensorSlicingOp; +template +class TensorReverseOp; +template +class TensorRollOp; +template +class TensorPaddingOp; +template +class TensorShufflingOp; +template +class TensorStridingOp; +template +class TensorStridingSlicingOp; +template +class TensorInflationOp; +template +class TensorGeneratorOp; +template +class TensorAssignOp; +template +class TensorScanOp; +template +class TensorTraceOp; + +template +class TensorCustomUnaryOp; +template +class TensorCustomBinaryOp; + +template class MakePointer_ = MakePointer> +class TensorEvalToOp; +template +class TensorForcedEvalOp; + +template +class TensorDevice; +template +class TensorAsyncDevice; +template +struct TensorEvaluator; + +struct NoOpOutputKernel; + +struct DefaultDevice; +struct ThreadPoolDevice; +struct GpuDevice; +struct SyclDevice; + +#ifdef EIGEN_USE_SYCL +namespace TensorSycl { +namespace internal { +template +class GenericNondeterministicReducer; +} +} // namespace TensorSycl +#endif + +enum FFTResultType { RealPart = 0, ImagPart = 1, BothParts = 2 }; + +enum FFTDirection { FFT_FORWARD = 0, FFT_REVERSE = 1 }; + +namespace internal { + +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess; +}; + +template +struct IsVectorizable { + static const bool value = + TensorEvaluator::PacketAccess && TensorEvaluator::IsAligned; +}; + +// Tiled evaluation strategy. +enum TiledEvaluation { + Off = 0, // tiled evaluation is not supported + On = 1, // still work in progress (see TensorBlock.h) +}; + +template +struct IsTileable { + // Check that block evaluation is supported and it's a preferred option (at + // least one sub-expression has much faster block evaluation, e.g. + // broadcasting). + static constexpr bool BlockAccess = + TensorEvaluator::BlockAccess && TensorEvaluator::PreferBlockAccess; + + static const TiledEvaluation value = BlockAccess ? 
TiledEvaluation::On : TiledEvaluation::Off; +}; + +template ::value, + TiledEvaluation Tiling = IsTileable::value> +class TensorExecutor; + +template ::value, + TiledEvaluation Tiling = IsTileable::value> +class TensorAsyncExecutor; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h new file mode 100644 index 00000000000..7a87594c525 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -0,0 +1,422 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +/** \internal + * \brief Template functor to compute the modulo between an array and a scalar. + */ +template +struct scalar_mod_op { + EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const { return a % m_divisor; } + const Scalar m_divisor; +}; +template +struct functor_traits > { + enum { Cost = scalar_div_cost::value, PacketAccess = false }; +}; + +/** \internal + * \brief Template functor to compute the modulo between 2 arrays. + */ +template +struct scalar_mod2_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const { return a % b; } +}; +template +struct functor_traits > { + enum { Cost = scalar_div_cost::value, PacketAccess = false }; +}; + +template +struct scalar_fmod_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const { + return numext::fmod(a, b); + } +}; +template +struct functor_traits > { + enum { + Cost = 13, // Reciprocal throughput of FPREM on Haswell. 
+ PacketAccess = false + }; +}; + +template +struct reducer_traits { + enum { Cost = 1, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true }; +}; + +// Standard reduction functors +template +struct SumReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + internal::scalar_sum_op sum_op; + *accum = sum_op(*accum, t); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = padd(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(0); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_sum_op sum_op; + return sum_op(saccum, predux(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasAdd, + IsStateful = false, + IsExactlyAssociative = NumTraits::IsInteger + }; +}; + +template +struct MeanReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MeanReducer() : scalarCount_(0), packetCount_(0) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + internal::scalar_sum_op sum_op; + *accum = sum_op(*accum, t); + scalarCount_++; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { + (*accum) = padd(*accum, p); + packetCount_++; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(0); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + internal::scalar_quotient_op quotient_op; + return quotient_op(accum, T(scalarCount_)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return pdiv(vaccum, pset1(T(packetCount_))); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_sum_op sum_op; + internal::scalar_quotient_op quotient_op; + return quotient_op(sum_op(saccum, predux(vaccum)), T(scalarCount_ + packetCount_ * unpacket_traits::size)); + } + + protected: + DenseIndex scalarCount_; + DenseIndex packetCount_; +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasAdd && PacketType::HasDiv && !NumTraits::IsInteger, + IsStateful = true, + IsExactlyAssociative = NumTraits::IsInteger + }; +}; + +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { return Eigen::NumTraits::lowest(); } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { return -Eigen::NumTraits::infinity(); } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { return Eigen::NumTraits::highest(); } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { return 
Eigen::NumTraits::infinity(); } +}; + +template +struct MaxReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + scalar_max_op op; + *accum = op(t, *accum); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + scalar_max_op op; + (*accum) = op.packetOp(*accum, p); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return MinMaxBottomValue::IsInteger>::bottom_value(); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + scalar_max_op op; + return op(saccum, op.predux(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasMax, + IsStateful = false, + IsExactlyAssociative = (NaNPropagation != PropagateFast) + }; +}; + +template +struct MinReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + scalar_min_op op; + *accum = op(t, *accum); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + scalar_min_op op; + (*accum) = op.packetOp(*accum, p); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return MinMaxBottomValue::IsInteger>::bottom_value(); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + scalar_min_op op; + return op(saccum, op.predux(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasMin, + IsStateful = false, + IsExactlyAssociative = (NaNPropagation != PropagateFast) + }; +}; + +template +struct ProdReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + internal::scalar_product_op prod_op; + (*accum) = prod_op(*accum, t); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmul(*accum, p); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(1); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_product_op prod_op; + return prod_op(saccum, predux_mul(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::MulCost, + PacketAccess = PacketType::HasMul, + IsStateful = false, + IsExactlyAssociative 
= true + }; +}; + +struct AndReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum && t; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { return accum; } +}; + +template +struct reducer_traits { + enum { Cost = 1, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true }; +}; + +struct OrReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum || t; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { return false; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { return accum; } +}; + +template +struct reducer_traits { + enum { Cost = 1, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true }; +}; + +// Argmin/Argmax reducers. Returns the first occurrence if multiple locations +// contain the same min/max value. +template +struct ArgMaxPairReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t.second < accum->second) { + return; + } else if (t.second > accum->second || accum->first > t.first) { + *accum = t; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return T(0, NumTraits::lowest()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { return accum; } +}; + +template +struct reducer_traits, Device> { + enum { Cost = NumTraits::AddCost, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true }; +}; + +template +struct ArgMinPairReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { + if (t.second > accum->second) { + return; + } else if (t.second < accum->second || accum->first > t.first) { + *accum = t; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return T(0, NumTraits::highest()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { return accum; } +}; + +template +struct reducer_traits, Device> { + enum { Cost = NumTraits::AddCost, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true }; +}; + +template +class GaussianGenerator { + public: + static constexpr bool PacketAccess = false; + + EIGEN_DEVICE_FUNC GaussianGenerator(const array& means, const array& std_devs) + : m_means(means) { + EIGEN_UNROLL_LOOP + for (size_t i = 0; i < NumDims; ++i) { + m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2; + } + } + + EIGEN_DEVICE_FUNC T operator()(const array& coordinates) const { + T tmp = T(0); + EIGEN_UNROLL_LOOP + for (size_t i = 0; i < NumDims; ++i) { + T offset = coordinates[i] - m_means[i]; + tmp += offset * offset / m_two_sigmas[i]; + } + return numext::exp(-tmp); + } + + private: + array m_means; + array m_two_sigmas; +}; + +template +struct functor_traits > { + enum { + Cost = NumDims * + (2 * NumTraits::AddCost + NumTraits::MulCost + functor_traits >::Cost) + + functor_traits >::Cost, + PacketAccess = GaussianGenerator::PacketAccess + }; +}; + +template +struct scalar_clamp_op { + EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { + return numext::mini(numext::maxi(x, m_min), m_max); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& x) const { + return internal::pmin(internal::pmax(x, pset1(m_min)), 
pset1(m_max)); + } + const Scalar m_min; + const Scalar m_max; +}; +template +struct functor_traits > { + enum { + Cost = 2 * NumTraits::AddCost, + PacketAccess = (packet_traits::HasMin && packet_traits::HasMax) + }; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h new file mode 100644 index 00000000000..b038eaf787a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -0,0 +1,271 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorGeneratorOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor generator class. + * + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorGeneratorOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorGeneratorOp type; +}; + +} // end namespace internal + +template +class TensorGeneratorOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator) + : m_xpr(expr), m_generator(generator) {} + + EIGEN_DEVICE_FUNC const Generator& generator() const { return m_generator; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Generator m_generator; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorGeneratorOp XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator::Dimensions Dimensions; + static constexpr int NumDims = internal::array_size::value; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = 
(PacketType::size > 1), + BlockAccess = true, + PreferBlockAccess = true, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef internal::TensorIntDivisor IndexDivisor; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), m_generator(op.generator()) { + TensorEvaluator argImpl(op.expression(), device); + m_dimensions = argImpl.dimensions(); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + EIGEN_UNROLL_LOOP + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); + } + } else { + m_strides[NumDims - 1] = 1; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { return true; } + EIGEN_STRONG_INLINE void cleanup() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + array coords; + extract_coordinates(index, coords); + return m_generator(coords); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + const int packetSize = PacketType::size; + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX std::remove_const_t values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.firstLevelCacheSize(); + // TODO(ezhulenev): Generator should have a cost. + return internal::TensorBlockResourceRequirements::skewed(target_size); + } + + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + static const bool is_col_major = static_cast(Layout) == static_cast(ColMajor); + + // Compute spatial coordinates for the first block element. + array coords; + extract_coordinates(desc.offset(), coords); + array initial_coords = coords; + + // Offset in the output block buffer. + Index offset = 0; + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims; ++i) { + const int dim = is_col_major ? i : NumDims - 1 - i; + it[i].size = desc.dimension(dim); + it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride); + it[i].span = it[i].stride * (it[i].size - 1); + it[i].count = 0; + } + eigen_assert(it[0].stride == 1); + + // Prepare storage for the materialized generator result. 
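As a usage sketch of the API this evaluator backs: a generator is any functor that maps an element's coordinate array to its value, attached with TensorBase::generate; the shape comes from the expression it is applied to.

#include <unsupported/Eigen/CXX11/Tensor>

struct RowColId {  // the value encodes the coordinates that produced it
  float operator()(const Eigen::array<Eigen::Index, 2>& coords) const {
    return static_cast<float>(coords[0] * 100 + coords[1]);
  }
};

int main() {
  Eigen::Tensor<float, 2> shape_source(4, 5);  // only its 4x5 shape is used
  shape_source.setZero();
  Eigen::Tensor<float, 2> out = shape_source.generate(RowColId());
  return out(2, 3) == 203.0f ? 0 : 1;
}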
+ const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + + CoeffReturnType* block_buffer = block_storage.data(); + + static const int packet_size = PacketType::size; + + static const int inner_dim = is_col_major ? 0 : NumDims - 1; + const Index inner_dim_size = it[0].size; + const Index inner_dim_vectorized = inner_dim_size - packet_size; + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + Index i = 0; + // Generate data for the vectorized part of the inner-most dimension. + for (; i <= inner_dim_vectorized; i += packet_size) { + for (Index j = 0; j < packet_size; ++j) { + array j_coords = coords; // Break loop dependence. + j_coords[inner_dim] += j; + *(block_buffer + offset + i + j) = m_generator(j_coords); + } + coords[inner_dim] += packet_size; + } + // Finalize non-vectorized part of the inner-most dimension. + for (; i < inner_dim_size; ++i) { + *(block_buffer + offset + i) = m_generator(coords); + coords[inner_dim]++; + } + coords[inner_dim] = initial_coords[inner_dim]; + + // For the 1d tensor we need to generate only one inner-most dimension. + if (NumDims == 1) break; + + // Update offset. + for (i = 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + offset += it[i].stride; + coords[is_col_major ? i : NumDims - 1 - i]++; + break; + } + if (i != NumDims - 1) it[i].count = 0; + coords[is_col_major ? i : NumDims - 1 - i] = initial_coords[is_col_major ? i : NumDims - 1 - i]; + offset -= it[i].span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + // TODO(rmlarsen): This is just a placeholder. Define interface to make + // generators return their cost. + return TensorOpCost(0, 0, TensorOpCost::AddCost() + TensorOpCost::MulCost()); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_coordinates(Index index, array& coords) const { + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[NumDims - 1] = index; + } + } + + const Device EIGEN_DEVICE_REF m_device; + Dimensions m_dimensions; + array m_strides; + array m_fast_strides; + Generator m_generator; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h new file mode 100644 index 00000000000..6a1240cfacd --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h @@ -0,0 +1,36 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Eugene Brevdo +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
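A minimal usage sketch for the generator op above (assuming only the public TensorBase::generate() wrapper, which constructs a TensorGeneratorOp; the CoordSum functor is illustrative, not part of the patch). Each output coefficient is produced by calling the generator with that coefficient's multi-dimensional coordinates, exactly as coeff() and block() do in the evaluator:

    #include <unsupported/Eigen/CXX11/Tensor>

    // Generator functor: invoked with the coordinates of each coefficient.
    struct CoordSum {
      float operator()(const Eigen::array<Eigen::Index, 2>& coords) const {
        return static_cast<float>(coords[0] + 10 * coords[1]);
      }
    };

    int main() {
      Eigen::Tensor<float, 2> t(3, 4);
      // Evaluating the expression calls CoordSum once per coefficient.
      Eigen::Tensor<float, 2> g = t.generate(CoordSum());
      return 0;
    }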
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given tensors. + * + * This function computes the regularized incomplete beta function (integral). + * + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseTernaryOp, + const ADerived, const BDerived, const XDerived> +betainc(const Eigen::TensorBase& a, + const Eigen::TensorBase& b, + const Eigen::TensorBase& x) { + return TensorCwiseTernaryOp, const ADerived, const BDerived, + const XDerived>(a.derived(), b.derived(), x.derived(), + internal::scalar_betainc_op()); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h new file mode 100644 index 00000000000..3073272a24f --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h @@ -0,0 +1,101 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2018 Deven Desai +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) +#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H + +// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design +// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU, but +// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler +// When compiling such files, gcc will end up trying to pick up the CUDA headers by +// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU) +// This will obviously not work when trying to compile tensorflow on a system with no CUDA +// To work around this issue for HIP systems (and leave the default behaviour intact), the +// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and +// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is +// defined. 
In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well + +#if defined(EIGEN_USE_HIP) + +#define gpuStream_t hipStream_t +#define gpuDeviceProp_t hipDeviceProp_t +#define gpuError_t hipError_t +#define gpuSuccess hipSuccess +#define gpuErrorNotReady hipErrorNotReady +#define gpuGetDeviceCount hipGetDeviceCount +#define gpuGetLastError hipGetLastError +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorName hipGetErrorName +#define gpuGetErrorString hipGetErrorString +#define gpuGetDeviceProperties hipGetDeviceProperties +#define gpuStreamDefault hipStreamDefault +#define gpuGetDevice hipGetDevice +#define gpuSetDevice hipSetDevice +#define gpuMalloc hipMalloc +#define gpuFree hipFree +#define gpuMemsetAsync hipMemsetAsync +#define gpuMemset2DAsync hipMemset2DAsync +#define gpuMemcpyAsync hipMemcpyAsync +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuStreamQuery hipStreamQuery +#define gpuSharedMemConfig hipSharedMemConfig +#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig +#define gpuStreamSynchronize hipStreamSynchronize +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuMemcpy hipMemcpy + +#else + +#define gpuStream_t cudaStream_t +#define gpuDeviceProp_t cudaDeviceProp +#define gpuError_t cudaError_t +#define gpuSuccess cudaSuccess +#define gpuErrorNotReady cudaErrorNotReady +#define gpuGetDeviceCount cudaGetDeviceCount +#define gpuGetLastError cudaGetLastError +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorName cudaGetErrorName +#define gpuGetErrorString cudaGetErrorString +#define gpuGetDeviceProperties cudaGetDeviceProperties +#define gpuStreamDefault cudaStreamDefault +#define gpuGetDevice cudaGetDevice +#define gpuSetDevice cudaSetDevice +#define gpuMalloc cudaMalloc +#define gpuFree cudaFree +#define gpuMemsetAsync cudaMemsetAsync +#define gpuMemset2DAsync cudaMemset2DAsync +#define gpuMemcpyAsync cudaMemcpyAsync +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuStreamQuery cudaStreamQuery +#define gpuSharedMemConfig cudaSharedMemConfig +#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig +#define gpuStreamSynchronize cudaStreamSynchronize +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuMemcpy cudaMemcpy + +#endif + +// gpu_assert can be overridden +#ifndef gpu_assert + +#if defined(EIGEN_HIP_DEVICE_COMPILE) +// HIPCC do not support the use of assert on the GPU side. +#define gpu_assert(COND) +#else +#define gpu_assert(COND) eigen_assert(COND) +#endif + +#endif // gpu_assert + +#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h new file mode 100644 index 00000000000..509bcee0b87 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h @@ -0,0 +1,45 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2018 Deven Desai +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) + +#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + +#undef gpuStream_t +#undef gpuDeviceProp_t +#undef gpuError_t +#undef gpuSuccess +#undef gpuErrorNotReady +#undef gpuGetDeviceCount +#undef gpuGetErrorString +#undef gpuGetDeviceProperties +#undef gpuStreamDefault +#undef gpuGetDevice +#undef gpuSetDevice +#undef gpuMalloc +#undef gpuFree +#undef gpuMemsetAsync +#undef gpuMemset2DAsync +#undef gpuMemcpyAsync +#undef gpuMemcpyDeviceToDevice +#undef gpuMemcpyDeviceToHost +#undef gpuMemcpyHostToDevice +#undef gpuStreamQuery +#undef gpuSharedMemConfig +#undef gpuDeviceSetSharedMemConfig +#undef gpuStreamSynchronize +#undef gpuDeviceSynchronize +#undef gpuMemcpy + +#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + +#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H + +#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h new file mode 100644 index 00000000000..0bdb1ab7985 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -0,0 +1,413 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
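A minimal sketch of how the define/undefine pair above is intended to be used inside an EIGEN_USE_GPU translation unit (allocate_device_buffer is an illustrative helper, not Eigen code; only the gpu* aliases and gpu_assert come from the headers above):

    #include <cstddef>
    #include "TensorGpuHipCudaDefines.h"

    // Allocates a device buffer through the portability aliases: this compiles
    // to cudaMalloc/cudaSuccess on CUDA builds and to hipMalloc/hipSuccess
    // when EIGEN_USE_HIP is defined.
    static float* allocate_device_buffer(std::size_t n) {
      float* ptr = nullptr;
      gpuError_t err = gpuMalloc(reinterpret_cast<void**>(&ptr), n * sizeof(float));
      gpu_assert(err == gpuSuccess);
      return ptr;
    }

    // Restore the macro namespace once the aliases are no longer needed.
    #include "TensorGpuHipCudaUndefines.h"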
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H +#define EIGEN_CXX11_TENSOR_TENSOR_IO_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +struct TensorIOFormat; + +namespace internal { +template +struct TensorPrinter; +} + +template +struct TensorIOFormatBase { + using Derived = Derived_; + TensorIOFormatBase(const std::vector& separator, const std::vector& prefix, + const std::vector& suffix, int precision = StreamPrecision, int flags = 0, + const std::string& tenPrefix = "", const std::string& tenSuffix = "", const char fill = ' ') + : tenPrefix(tenPrefix), + tenSuffix(tenSuffix), + prefix(prefix), + suffix(suffix), + separator(separator), + fill(fill), + precision(precision), + flags(flags) { + init_spacer(); + } + + void init_spacer() { + if ((flags & DontAlignCols)) return; + spacer.resize(prefix.size()); + spacer[0] = ""; + int i = int(tenPrefix.length()) - 1; + while (i >= 0 && tenPrefix[i] != '\n') { + spacer[0] += ' '; + i--; + } + + for (std::size_t k = 1; k < prefix.size(); k++) { + int j = int(prefix[k].length()) - 1; + while (j >= 0 && prefix[k][j] != '\n') { + spacer[k] += ' '; + j--; + } + } + } + + std::string tenPrefix; + std::string tenSuffix; + std::vector prefix; + std::vector suffix; + std::vector separator; + char fill; + int precision; + int flags; + std::vector spacer{}; +}; + +struct TensorIOFormatNumpy : public TensorIOFormatBase { + using Base = TensorIOFormatBase; + TensorIOFormatNumpy() + : Base(/*separator=*/{" ", "\n"}, /*prefix=*/{"", "["}, /*suffix=*/{"", "]"}, /*precision=*/StreamPrecision, + /*flags=*/0, /*tenPrefix=*/"[", /*tenSuffix=*/"]") {} +}; + +struct TensorIOFormatNative : public TensorIOFormatBase { + using Base = TensorIOFormatBase; + TensorIOFormatNative() + : Base(/*separator=*/{", ", ",\n", "\n"}, /*prefix=*/{"", "{"}, /*suffix=*/{"", "}"}, + /*precision=*/StreamPrecision, /*flags=*/0, /*tenPrefix=*/"{", /*tenSuffix=*/"}") {} +}; + +struct TensorIOFormatPlain : public TensorIOFormatBase { + using Base = TensorIOFormatBase; + TensorIOFormatPlain() + : Base(/*separator=*/{" ", "\n", "\n", ""}, /*prefix=*/{""}, /*suffix=*/{""}, /*precision=*/StreamPrecision, + /*flags=*/0, /*tenPrefix=*/"", /*tenSuffix=*/"") {} +}; + +struct TensorIOFormatLegacy : public TensorIOFormatBase { + using Base = TensorIOFormatBase; + TensorIOFormatLegacy() + : Base(/*separator=*/{", ", "\n"}, /*prefix=*/{"", "["}, /*suffix=*/{"", "]"}, /*precision=*/StreamPrecision, + /*flags=*/0, /*tenPrefix=*/"", /*tenSuffix=*/"") {} +}; + +struct TensorIOFormat : public TensorIOFormatBase { + using Base = TensorIOFormatBase; + TensorIOFormat(const std::vector& separator, const std::vector& prefix, + const std::vector& suffix, int precision = StreamPrecision, int flags = 0, + const std::string& tenPrefix = "", const std::string& tenSuffix = "", const char fill = ' ') + : Base(separator, prefix, suffix, precision, flags, tenPrefix, tenSuffix, fill) {} + + static inline const TensorIOFormatNumpy Numpy() { return TensorIOFormatNumpy{}; } + + static inline const TensorIOFormatPlain Plain() { return TensorIOFormatPlain{}; } + + static inline const TensorIOFormatNative Native() { return TensorIOFormatNative{}; } + + static inline const TensorIOFormatLegacy Legacy() { return TensorIOFormatLegacy{}; } +}; + +template +class TensorWithFormat; +// specialize for Layout=ColMajor, Layout=RowMajor and rank=0. 
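+// Usage sketch (editor's note, not upstream text): a format is selected via
+// TensorBase::format(), which wraps the tensor in one of the TensorWithFormat
+// specializations below:
+//   Eigen::Tensor<int, 2> t(2, 3);
+//   t.setValues({{1, 2, 3}, {4, 5, 6}});
+//   std::cout << t;                                         // Plain (default)
+//   std::cout << t.format(Eigen::TensorIOFormat::Numpy());  // [[1 2 3]
+//                                                           //  [4 5 6]]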
+template +class TensorWithFormat { + public: + TensorWithFormat(const T& tensor, const Format& format) : t_tensor(tensor), t_format(format) {} + + friend std::ostream& operator<<(std::ostream& os, const TensorWithFormat& wf) { + // Evaluate the expression if needed + typedef TensorEvaluator, DefaultDevice> Evaluator; + TensorForcedEvalOp eval = wf.t_tensor.eval(); + Evaluator tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + internal::TensorPrinter::run(os, tensor, wf.t_format); + // Cleanup. + tensor.cleanup(); + return os; + } + + protected: + T t_tensor; + Format t_format; +}; + +template +class TensorWithFormat { + public: + TensorWithFormat(const T& tensor, const Format& format) : t_tensor(tensor), t_format(format) {} + + friend std::ostream& operator<<(std::ostream& os, const TensorWithFormat& wf) { + // Switch to RowMajor storage and print afterwards + typedef typename T::Index IndexType; + std::array shuffle; + std::array id; + std::iota(id.begin(), id.end(), IndexType(0)); + std::copy(id.begin(), id.end(), shuffle.rbegin()); + auto tensor_row_major = wf.t_tensor.swap_layout().shuffle(shuffle); + + // Evaluate the expression if needed + typedef TensorEvaluator, DefaultDevice> Evaluator; + TensorForcedEvalOp eval = tensor_row_major.eval(); + Evaluator tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + internal::TensorPrinter::run(os, tensor, wf.t_format); + // Cleanup. + tensor.cleanup(); + return os; + } + + protected: + T t_tensor; + Format t_format; +}; + +template +class TensorWithFormat { + public: + TensorWithFormat(const T& tensor, const Format& format) : t_tensor(tensor), t_format(format) {} + + friend std::ostream& operator<<(std::ostream& os, const TensorWithFormat& wf) { + // Evaluate the expression if needed + typedef TensorEvaluator, DefaultDevice> Evaluator; + TensorForcedEvalOp eval = wf.t_tensor.eval(); + Evaluator tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + internal::TensorPrinter::run(os, tensor, wf.t_format); + // Cleanup. + tensor.cleanup(); + return os; + } + + protected: + T t_tensor; + Format t_format; +}; + +namespace internal { + +// Default scalar printer. 
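+// (Editor's note) ScalarPrinter is specialized on the (Scalar, Format) pair:
+// complex scalars render as "a+bj" under the Numpy format and as "{a, b}"
+// under the Native format below, while every other combination falls through
+// to the default operator<< path defined next.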
+template +struct ScalarPrinter { + static void run(std::ostream& stream, const Scalar& scalar, const Format&) { stream << scalar; } +}; + +template +struct ScalarPrinter::IsComplex>> { + static void run(std::ostream& stream, const Scalar& scalar, const TensorIOFormatNumpy&) { + stream << numext::real(scalar) << "+" << numext::imag(scalar) << "j"; + } +}; + +template +struct ScalarPrinter::IsComplex>> { + static void run(std::ostream& stream, const Scalar& scalar, const TensorIOFormatNative&) { + stream << "{" << numext::real(scalar) << ", " << numext::imag(scalar) << "}"; + } +}; + +template +struct TensorPrinter { + using Scalar = std::remove_const_t; + + static void run(std::ostream& s, const Tensor& tensor, const Format& fmt) { + typedef typename Tensor::Index IndexType; + + eigen_assert(Tensor::Layout == RowMajor); + typedef std::conditional_t::value || is_same::value || + is_same::value || is_same::value, + int, + std::conditional_t>::value || + is_same>::value || + is_same>::value || + is_same>::value, + std::complex, const Scalar&>> + PrintType; + + const IndexType total_size = array_prod(tensor.dimensions()); + + std::streamsize explicit_precision; + if (fmt.precision == StreamPrecision) { + explicit_precision = 0; + } else if (fmt.precision == FullPrecision) { + if (NumTraits::IsInteger) { + explicit_precision = 0; + } else { + explicit_precision = significant_decimals_impl::run(); + } + } else { + explicit_precision = fmt.precision; + } + + std::streamsize old_precision = 0; + if (explicit_precision) old_precision = s.precision(explicit_precision); + + IndexType width = 0; + bool align_cols = !(fmt.flags & DontAlignCols); + if (align_cols) { + // compute the largest width + for (IndexType i = 0; i < total_size; i++) { + std::stringstream sstr; + sstr.copyfmt(s); + ScalarPrinter::run(sstr, static_cast(tensor.data()[i]), fmt); + width = std::max(width, IndexType(sstr.str().length())); + } + } + s << fmt.tenPrefix; + for (IndexType i = 0; i < total_size; i++) { + std::array is_at_end{}; + std::array is_at_begin{}; + + // is the ith element the end of an coeff (always true), of a row, of a matrix, ...? + for (std::size_t k = 0; k < rank; k++) { + if ((i + 1) % (std::accumulate(tensor.dimensions().rbegin(), tensor.dimensions().rbegin() + k, 1, + std::multiplies())) == + 0) { + is_at_end[k] = true; + } + } + + // is the ith element the begin of an coeff (always true), of a row, of a matrix, ...? + for (std::size_t k = 0; k < rank; k++) { + if (i % (std::accumulate(tensor.dimensions().rbegin(), tensor.dimensions().rbegin() + k, 1, + std::multiplies())) == + 0) { + is_at_begin[k] = true; + } + } + + // do we have a line break? + bool is_at_begin_after_newline = false; + for (std::size_t k = 0; k < rank; k++) { + if (is_at_begin[k]) { + std::size_t separator_index = (k < fmt.separator.size()) ? k : fmt.separator.size() - 1; + if (fmt.separator[separator_index].find('\n') != std::string::npos) { + is_at_begin_after_newline = true; + } + } + } + + bool is_at_end_before_newline = false; + for (std::size_t k = 0; k < rank; k++) { + if (is_at_end[k]) { + std::size_t separator_index = (k < fmt.separator.size()) ? k : fmt.separator.size() - 1; + if (fmt.separator[separator_index].find('\n') != std::string::npos) { + is_at_end_before_newline = true; + } + } + } + + std::stringstream suffix, prefix, separator; + for (std::size_t k = 0; k < rank; k++) { + std::size_t suffix_index = (k < fmt.suffix.size()) ? 
k : fmt.suffix.size() - 1; + if (is_at_end[k]) { + suffix << fmt.suffix[suffix_index]; + } + } + for (std::size_t k = 0; k < rank; k++) { + std::size_t separator_index = (k < fmt.separator.size()) ? k : fmt.separator.size() - 1; + if (is_at_end[k] && + (!is_at_end_before_newline || fmt.separator[separator_index].find('\n') != std::string::npos)) { + separator << fmt.separator[separator_index]; + } + } + for (std::size_t k = 0; k < rank; k++) { + std::size_t spacer_index = (k < fmt.spacer.size()) ? k : fmt.spacer.size() - 1; + if (i != 0 && is_at_begin_after_newline && (!is_at_begin[k] || k == 0)) { + prefix << fmt.spacer[spacer_index]; + } + } + for (int k = rank - 1; k >= 0; k--) { + std::size_t prefix_index = (static_cast(k) < fmt.prefix.size()) ? k : fmt.prefix.size() - 1; + if (is_at_begin[k]) { + prefix << fmt.prefix[prefix_index]; + } + } + + s << prefix.str(); + // So we don't mess around with formatting, output scalar to a string stream, and adjust the width/fill manually. + std::stringstream sstr; + sstr.copyfmt(s); + ScalarPrinter::run(sstr, static_cast(tensor.data()[i]), fmt); + std::string scalar_str = sstr.str(); + IndexType scalar_width = scalar_str.length(); + if (width && scalar_width < width) { + std::string filler; + for (IndexType j = scalar_width; j < width; ++j) { + filler.push_back(fmt.fill); + } + s << filler; + } + s << scalar_str; + s << suffix.str(); + if (i < total_size - 1) { + s << separator.str(); + } + } + s << fmt.tenSuffix; + if (explicit_precision) s.precision(old_precision); + } +}; + +template +struct TensorPrinter> { + using Format = TensorIOFormatLegacy; + using Scalar = std::remove_const_t; + + static void run(std::ostream& s, const Tensor& tensor, const Format&) { + typedef typename Tensor::Index IndexType; + // backwards compatibility case: print tensor after reshaping to matrix of size dim(0) x + // (dim(1)*dim(2)*...*dim(rank-1)). 
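+    // For instance (editor's note), a 2x3x4 tensor is printed as the 2x12
+    // matrix obtained by flattening all trailing dimensions into columns.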
+    const IndexType total_size = internal::array_prod(tensor.dimensions());
+    if (total_size > 0) {
+      const IndexType first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
+      Map<const Array<Scalar, Dynamic, Dynamic, Tensor::Layout>> matrix(tensor.data(), first_dim,
+                                                                        total_size / first_dim);
+      s << matrix;
+      return;
+    }
+  }
+};
+
+template <typename Tensor, typename Format>
+struct TensorPrinter<Tensor, 0, Format> {
+  static void run(std::ostream& s, const Tensor& tensor, const Format& fmt) {
+    using Scalar = std::remove_const_t<typename Tensor::Scalar>;
+
+    std::streamsize explicit_precision;
+    if (fmt.precision == StreamPrecision) {
+      explicit_precision = 0;
+    } else if (fmt.precision == FullPrecision) {
+      if (NumTraits<Scalar>::IsInteger) {
+        explicit_precision = 0;
+      } else {
+        explicit_precision = significant_decimals_impl<Scalar>::run();
+      }
+    } else {
+      explicit_precision = fmt.precision;
+    }
+
+    std::streamsize old_precision = 0;
+    if (explicit_precision) old_precision = s.precision(explicit_precision);
+    s << fmt.tenPrefix;
+    ScalarPrinter<Scalar, Format>::run(s, tensor.coeff(0), fmt);
+    s << fmt.tenSuffix;
+    if (explicit_precision) s.precision(old_precision);
+  }
+};
+
+}  // end namespace internal
+template <typename T>
+std::ostream& operator<<(std::ostream& s, const TensorBase<T, ReadOnlyAccessors>& t) {
+  s << t.format(TensorIOFormat::Plain());
+  return s;
+}
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_IO_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
new file mode 100644
index 00000000000..291301a8193
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -0,0 +1,590 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
+#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class TensorImagePatch
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Patch extraction specialized for image processing.
+ * This assumes that the input has at least 3 dimensions, ordered as follows:
+ * 1st dimension: channels (of size d)
+ * 2nd dimension: rows (of size r)
+ * 3rd dimension: columns (of size c)
+ * There can be additional dimensions such as time (for video) or batch (for
+ * bulk processing) after the first 3.
+ * Calling the image patch code with patch_rows and patch_cols is equivalent
+ * to calling the regular patch extraction code with parameters d, patch_rows,
+ * patch_cols, and 1 for all the additional dimensions.
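+ *
+ * Shape sketch (editor's note): with a ColMajor input of dimensions
+ * (d, r, c, b), input.extract_image_patches(patch_rows, patch_cols) yields a
+ * tensor of dimensions (d, patch_rows, patch_cols, outputRows * outputCols, b),
+ * matching the evaluator's dimension setup below.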
+ */ +namespace internal { + +template +struct traits > : public traits { + typedef std::remove_const_t Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions + 1; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorImagePatchOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorImagePatchOp type; +}; + +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Self& self, const Index num_coeff_to_copy, + const Index dst_index, Scalar* dst_data, + const Index src_index) { + const Impl& impl = self.impl(); + for (Index i = 0; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Self& self, const Index num_coeff_to_copy, + const Index dst_index, Scalar* dst_data, + const Index src_index) { + const Impl& impl = self.impl(); + const Index packet_size = internal::unpacket_traits::size; + const Index vectorized_size = (num_coeff_to_copy / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = impl.template packet(src_index + i); + internal::pstoret(dst_data + dst_index + i, p); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchPaddingOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Index num_coeff_to_pad, const Scalar padding_value, + const Index dst_index, Scalar* dst_data) { + const Index packet_size = internal::unpacket_traits::size; + const Packet padded_packet = internal::pset1(padding_value); + const Index vectorized_size = (num_coeff_to_pad / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + internal::pstoret(dst_data + dst_index + i, padded_packet); + } + for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) { + dst_data[dst_index + i] = padding_value; + } + } +}; + +} // end namespace internal + +template +class TensorImagePatchOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, + DenseIndex patch_cols, DenseIndex row_strides, + DenseIndex col_strides, DenseIndex in_row_strides, + DenseIndex in_col_strides, DenseIndex row_inflate_strides, + DenseIndex 
col_inflate_strides, PaddingType padding_type, + Scalar padding_value) + : m_xpr(expr), + m_patch_rows(patch_rows), + m_patch_cols(patch_cols), + m_row_strides(row_strides), + m_col_strides(col_strides), + m_in_row_strides(in_row_strides), + m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), + m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), + m_padding_top(0), + m_padding_bottom(0), + m_padding_left(0), + m_padding_right(0), + m_padding_type(padding_type), + m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, + DenseIndex patch_cols, DenseIndex row_strides, + DenseIndex col_strides, DenseIndex in_row_strides, + DenseIndex in_col_strides, DenseIndex row_inflate_strides, + DenseIndex col_inflate_strides, DenseIndex padding_top, + DenseIndex padding_bottom, DenseIndex padding_left, + DenseIndex padding_right, Scalar padding_value) + : m_xpr(expr), + m_patch_rows(patch_rows), + m_patch_cols(patch_cols), + m_row_strides(row_strides), + m_col_strides(col_strides), + m_in_row_strides(in_row_strides), + m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), + m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), + m_padding_top(padding_top), + m_padding_bottom(padding_bottom), + m_padding_left(padding_left), + m_padding_right(padding_right), + m_padding_type(PADDING_VALID), + m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC DenseIndex in_row_strides() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC DenseIndex in_col_strides() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC bool padding_explicit() const { return m_padding_explicit; } + EIGEN_DEVICE_FUNC DenseIndex padding_top() const { return m_padding_top; } + EIGEN_DEVICE_FUNC DenseIndex padding_bottom() const { return m_padding_bottom; } + EIGEN_DEVICE_FUNC DenseIndex padding_left() const { return m_padding_left; } + EIGEN_DEVICE_FUNC DenseIndex padding_right() const { return m_padding_right; } + EIGEN_DEVICE_FUNC PaddingType padding_type() const { return m_padding_type; } + EIGEN_DEVICE_FUNC Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const DenseIndex m_in_row_strides; + const DenseIndex m_in_col_strides; + const DenseIndex m_row_inflate_strides; + const DenseIndex m_col_inflate_strides; + const bool m_padding_explicit; + const DenseIndex m_padding_top; + const DenseIndex m_padding_bottom; + const DenseIndex m_padding_left; + const DenseIndex m_padding_right; + const PaddingType m_padding_type; + const Scalar m_padding_value; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorImagePatchOp XprType; + typedef typename XprType::Index Index; + static 
constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + static constexpr int NumDims = NumInputDims + 1; + typedef DSizes Dimensions; + typedef std::remove_const_t Scalar; + typedef TensorEvaluator, Device> Self; + typedef TensorEvaluator Impl; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = true, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), m_impl(op.expression(), device) { + EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); + + m_paddingValue = op.padding_value(); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + // Caches a few variables. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputDepth = input_dims[0]; + m_inputRows = input_dims[1]; + m_inputCols = input_dims[2]; + } else { + m_inputDepth = input_dims[NumInputDims - 1]; + m_inputRows = input_dims[NumInputDims - 2]; + m_inputCols = input_dims[NumInputDims - 3]; + } + + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // Input strides and effective input/patch size + m_in_row_strides = op.in_row_strides(); + m_in_col_strides = op.in_col_strides(); + m_row_inflate_strides = op.row_inflate_strides(); + m_col_inflate_strides = op.col_inflate_strides(); + // The "effective" input rows and input cols are the input rows and cols + // after inflating them with zeros. + // For examples, a 2x3 matrix with row_inflate_strides and + // col_inflate_strides of 2 comes from: + // A B C + // D E F + // + // to a matrix is 3 x 5: + // + // A . B . C + // . . . . . + // D . E . 
F + + m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; + m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; + m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); + m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); + + if (op.padding_explicit()) { + m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / + static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / + static_cast(m_col_strides)); + m_rowPaddingTop = op.padding_top(); + m_colPaddingLeft = op.padding_left(); + } else { + // Computing padding from the type + switch (op.padding_type()) { + case PADDING_VALID: + m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = + numext::maxi(0, ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2); + m_colPaddingLeft = + numext::maxi(0, ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2); + break; + case PADDING_SAME: + m_outputRows = numext::ceil(m_input_rows_eff / static_cast(m_row_strides)); + m_outputCols = numext::ceil(m_input_cols_eff / static_cast(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; + // The padding size calculation for PADDING_SAME has been updated to + // be consistent with how TensorFlow extracts its paddings. + m_rowPaddingTop = numext::maxi(0, m_rowPaddingTop); + m_colPaddingLeft = numext::maxi(0, m_colPaddingLeft); + break; + default: + eigen_assert(false && "unexpected padding"); + m_outputCols = 0; // silence the uninitialised warning; + m_outputRows = 0; //// silence the uninitialised warning; + } + } + eigen_assert(m_outputRows > 0); + eigen_assert(m_outputCols > 0); + + // Dimensions for result of extraction. + if (static_cast(Layout) == static_cast(ColMajor)) { + // ColMajor + // 0: depth + // 1: patch_rows + // 2: patch_cols + // 3: number of patches + // 4 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_rows(); + m_dimensions[2] = op.patch_cols(); + m_dimensions[3] = m_outputRows * m_outputCols; + for (int i = 4; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i - 1]; + } + } else { + // RowMajor + // NumDims-1: depth + // NumDims-2: patch_rows + // NumDims-3: patch_cols + // NumDims-4: number of patches + // NumDims-5 and beyond: anything else (such as batch). + m_dimensions[NumDims - 1] = input_dims[NumInputDims - 1]; + m_dimensions[NumDims - 2] = op.patch_rows(); + m_dimensions[NumDims - 3] = op.patch_cols(); + m_dimensions[NumDims - 4] = m_outputRows * m_outputCols; + for (int i = NumDims - 5; i >= 0; --i) { + m_dimensions[i] = input_dims[i]; + } + } + + // Strides for moving the patch in various dimensions. 
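+    // (Editor's note) In the ColMajor branch below, m_colStride separates
+    // consecutive patch columns (in depth-normalized offsets), m_patchStride
+    // separates whole patches, and m_otherStride separates entries of the
+    // outer (e.g. batch) dimensions; coeff() inverts them with the fast
+    // divisors initialized further down.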
+ if (static_cast(Layout) == static_cast(ColMajor)) { + m_colStride = m_dimensions[1]; + m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[3]; + } else { + m_colStride = m_dimensions[NumDims - 2]; + m_patchStride = m_colStride * m_dimensions[NumDims - 3] * m_dimensions[NumDims - 1]; + m_otherStride = m_patchStride * m_dimensions[NumDims - 4]; + } + + // Strides for navigating through the input tensor. + m_rowInputStride = m_inputDepth; + m_colInputStride = m_inputDepth * m_inputRows; + m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols; + + // Fast representations of different variables. + m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); + m_fastColStride = internal::TensorIntDivisor(m_colStride); + m_fastInflateRowStride = internal::TensorIntDivisor(m_row_inflate_strides); + m_fastInflateColStride = internal::TensorIntDivisor(m_col_inflate_strides); + m_fastInputColsEff = internal::TensorIntDivisor(m_input_cols_eff); + + // Number of patches in the width dimension. + m_fastOutputRows = internal::TensorIntDivisor(m_outputRows); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[0]); + } else { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims - 1]); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; + + // Other ways to index this element. + const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride; + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + // Calculate col index in the input original tensor. + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = + (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate row index in the original input tensor. + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = + (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? 
(inputRow / m_fastInflateRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + + const Index inputIndex = + depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { + return packetWithPossibleZero(index); + } + + const Index indices[2] = {index, index + PacketSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, + (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; + + const Index patch2DIndex = + (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + // Calculate col indices in the original input tensor. + const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft, + colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0] * m_colStride, + patchOffsets[1] - colOffsets[1] * m_colStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + // Calculate col indices in the original input tensor. + const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop, + rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) { + // no padding + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 
0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + const Index inputIndex = + depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.template packet(inputIndex); + } + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // We conservatively estimate the cost for the code path where the computed + // index is inside the original image and + // TensorEvaluator::CoordAccess is false. + const double compute_cost = + 3 * TensorOpCost::DivCost() + 6 * TensorOpCost::MulCost() + 8 * TensorOpCost::MulCost(); + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + + Index m_otherStride; + Index m_patchStride; + Index m_colStride; + Index m_row_strides; + Index m_col_strides; + + Index m_in_row_strides; + Index m_in_col_strides; + Index m_row_inflate_strides; + Index m_col_inflate_strides; + + Index m_input_rows_eff; + Index m_input_cols_eff; + Index m_patch_rows_eff; + Index m_patch_cols_eff; + + internal::TensorIntDivisor m_fastOtherStride; + internal::TensorIntDivisor m_fastPatchStride; + internal::TensorIntDivisor m_fastColStride; + internal::TensorIntDivisor m_fastInflateRowStride; + internal::TensorIntDivisor m_fastInflateColStride; + internal::TensorIntDivisor m_fastInputColsEff; + + Index m_rowInputStride; + Index m_colInputStride; + Index m_patchInputStride; + + Index m_inputDepth; + Index m_inputRows; + Index m_inputCols; + + Index m_outputRows; + Index m_outputCols; + + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + internal::TensorIntDivisor m_fastOutputRows; + internal::TensorIntDivisor m_fastOutputDepth; + + Scalar m_paddingValue; + + const Device EIGEN_DEVICE_REF m_device; + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H diff --git 
a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h new file mode 100644 index 00000000000..69ab6840c82 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -0,0 +1,620 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \internal + * + * \class TensorIndexList + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode a set of Tensor dimensions/indices. + * + * The indices in the list can be known at compile time or at runtime. A mix + * of static and dynamic indices can also be provided if needed. The tensor + * code will attempt to take advantage of the indices that are known at + * compile time to optimize the code it generates. + * + * This functionality requires a c++11 compliant compiler. If your compiler + * is older you need to use arrays of indices instead. + * + * Several examples are provided in the cxx11_tensor_index_list.cpp file. + * + * \sa Tensor + */ + +template +struct type2index { + static constexpr Index value = n; + EIGEN_DEVICE_FUNC constexpr operator Index() const { return n; } + EIGEN_DEVICE_FUNC void set(Index val) { eigen_assert(val == n); } +}; + +// This can be used with IndexPairList to get compile-time constant pairs, +// such as IndexPairList, type2indexpair<3,4>>(). 
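+// For example (editor's sketch):
+//   Eigen::IndexPairList<Eigen::type2indexpair<1, 0>> contract_dims;
+//   auto c = a.contract(b, contract_dims);  // pair (1,0) fixed at compile time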
+template +struct type2indexpair { + static constexpr Index first = f; + static constexpr Index second = s; + + constexpr EIGEN_DEVICE_FUNC operator IndexPair() const { return IndexPair(f, s); } + + EIGEN_DEVICE_FUNC void set(const IndexPair& val) { + eigen_assert(val.first == f); + eigen_assert(val.second == s); + } +}; + +template +struct NumTraits> { + typedef Index Real; + enum { IsComplex = 0, RequireInitialization = false, ReadCost = 1, AddCost = 1, MulCost = 1 }; + + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real epsilon() { return 0; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real dummy_precision() { return 0; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real highest() { return n; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real lowest() { return n; } +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC void update_value(T& val, Index new_val) { + val = internal::convert_index(new_val); +} +template +EIGEN_DEVICE_FUNC void update_value(type2index& val, Index new_val) { + val.set(new_val); +} + +template +EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair new_val) { + val = new_val; +} +template +EIGEN_DEVICE_FUNC void update_value(type2indexpair& val, IndexPair new_val) { + val.set(new_val); +} + +template +struct is_compile_time_constant { + static constexpr bool value = false; +}; + +template +struct is_compile_time_constant> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant&> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant&> { + static constexpr bool value = true; +}; + +template +struct is_compile_time_constant> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant&> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant&> { + static constexpr bool value = true; +}; + +template +struct IndexTuple; + +template +struct IndexTuple { + EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() {} + EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) 
{} + + static constexpr int count = 1 + sizeof...(O); + T head; + IndexTuple others; + typedef T Head; + typedef IndexTuple Other; +}; + +template +struct IndexTuple { + EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() {} + EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) {} + + constexpr static int count = 1; + T head; + typedef T Head; +}; + +template +struct IndexTupleExtractor; + +template +struct IndexTupleExtractor { + typedef typename IndexTupleExtractor::ValType ValType; + + EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { + return IndexTupleExtractor::get_val(val.others); + } + + EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { + return IndexTupleExtractor::get_val(val.others); + } + template + EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { + IndexTupleExtractor::set_val(val.others, new_val); + } +}; + +template +struct IndexTupleExtractor<0, T, O...> { + typedef T ValType; + + EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { return val.head; } + EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { return val.head; } + template + EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { + val.head = new_val; + } +}; + +template +EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor::ValType& array_get(IndexTuple& tuple) { + return IndexTupleExtractor::get_val(tuple); +} +template +EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor::ValType& array_get( + const IndexTuple& tuple) { + return IndexTupleExtractor::get_val(tuple); +} +template +struct array_size> { + static constexpr size_t value = IndexTuple::count; +}; +template +struct array_size> { + static constexpr size_t value = IndexTuple::count; +}; + +template +struct tuple_coeff { + template + EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index i, const IndexTuple& t) { + // return array_get(t) * (i == Idx) + tuple_coeff::get(i, t) * (i != Idx); + return (i == Idx ? 
array_get(t) : tuple_coeff::get(i, t)); + } + template + EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple& t, const ValueT& value) { + if (i == Idx) { + update_value(array_get(t), value); + } else { + tuple_coeff::set(i, t, value); + } + } + + template + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple& t) { + return ((i == Idx) && is_compile_time_constant::ValType>::value) || + tuple_coeff::value_known_statically(i, t); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple& t) { + return is_compile_time_constant::ValType>::value && + tuple_coeff::values_up_to_known_statically(t); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple& t) { + return is_compile_time_constant::ValType>::value && + is_compile_time_constant::ValType>::value && + array_get(t) > array_get(t) && + tuple_coeff::values_up_to_statically_known_to_increase(t); + } +}; + +template +struct tuple_coeff<0, ValueT> { + template + EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index /*i*/, const IndexTuple& t) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return array_get<0>(t) /* * (i == 0)*/; + } + template + EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple& t, const ValueT value) { + eigen_assert(i == 0); + update_value(array_get<0>(t), value); + } + template + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple&) { + return is_compile_time_constant::ValType>::value && (i == 0); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple&) { + return is_compile_time_constant::ValType>::value; + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple&) { + return true; + } +}; +} // namespace internal + +template +struct IndexList : internal::IndexTuple { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index operator[](const Index i) const { + return internal::tuple_coeff>::value - 1, + Index>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index get(const Index i) const { + return internal::tuple_coeff>::value - 1, + Index>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value) { + return internal::tuple_coeff>::value - 1, + Index>::set(i, *this, value); + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr std::size_t size() const { return 1 + sizeof...(OtherTypes); }; + + EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple& other) + : internal::IndexTuple(other) {} + EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) + : internal::IndexTuple(first, other...) 
{} + EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple() {} + + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const { + return internal::tuple_coeff>::value - 1, + Index>::value_known_statically(i, *this); + } + EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const { + return internal::tuple_coeff>::value - 1, + Index>::values_up_to_known_statically(*this); + } + + EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const { + return internal::tuple_coeff>::value - 1, + Index>::values_up_to_statically_known_to_increase(*this); + } +}; + +template +std::ostream& operator<<(std::ostream& os, const IndexList& dims) { + os << "["; + for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) { + if (i > 0) os << ", "; + os << dims[i]; + } + os << "]"; + return os; +} + +template +constexpr IndexList make_index_list(FirstType val1, OtherTypes... other_vals) { + return IndexList(val1, other_vals...); +} + +template +struct IndexPairList : internal::IndexTuple { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair operator[](const Index i) const { + return internal::tuple_coeff>::value - 1, + IndexPair>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const IndexPair value) { + return internal::tuple_coeff>::value - 1, + IndexPair>::set(i, *this, value); + } + + EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple& other) + : internal::IndexTuple(other) {} + EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple() {} + + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const { + return internal::tuple_coeff>::value - 1, + Index>::value_known_statically(i, *this); + } +}; + +namespace internal { + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList& sizes) { + Index result = 1; + EIGEN_UNROLL_LOOP + for (size_t i = 0; i < array_size>::value; ++i) { + result *= sizes[i]; + } + return result; +} + +template +struct array_size> { + static const size_t value = array_size>::value; +}; +template +struct array_size> { + static const size_t value = array_size>::value; +}; + +template +struct array_size> { + static const size_t value = 1 + sizeof...(OtherTypes); +}; +template +struct array_size> { + static const size_t value = 1 + sizeof...(OtherTypes); +}; + +template +EIGEN_DEVICE_FUNC constexpr Index array_get(IndexList& a) { + return IndexTupleExtractor::get_val(a); +} +template +EIGEN_DEVICE_FUNC constexpr Index array_get(const IndexList& a) { + return IndexTupleExtractor::get_val(a); +} + +template +struct index_known_statically_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index) { return false; } +}; + +template +struct index_known_statically_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) { + return IndexList().value_known_statically(i); + } +}; + +template +struct index_known_statically_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) { + return IndexList().value_known_statically(i); + } +}; + +template +struct all_indices_known_statically_impl { + static constexpr bool run() { return false; } +}; + +template +struct all_indices_known_statically_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return IndexList().all_values_known_statically(); + } +}; + +template +struct all_indices_known_statically_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return IndexList().all_values_known_statically(); + } +}; + +template +struct 
indices_statically_known_to_increase_impl { + EIGEN_DEVICE_FUNC static constexpr bool run() { return false; } +}; + +template +struct indices_statically_known_to_increase_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return Eigen::IndexList().values_statically_known_to_increase(); + } +}; + +template +struct indices_statically_known_to_increase_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return Eigen::IndexList().values_statically_known_to_increase(); + } +}; + +template +struct index_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } +}; + +template +struct index_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) == value); + } +}; + +template +struct index_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) == value); + } +}; + +template +struct index_statically_ne_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } +}; + +template +struct index_statically_ne_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) != value); + } +}; + +template +struct index_statically_ne_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) != value); + } +}; + +template +struct index_statically_gt_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } +}; + +template +struct index_statically_gt_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) > value); + } +}; + +template +struct index_statically_gt_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) > value); + } +}; + +template +struct index_statically_lt_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } +}; + +template +struct index_statically_lt_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) < value); + } +}; + +template +struct index_statically_lt_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) < value); + } +}; + +template +struct index_pair_first_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } +}; + +template +struct index_pair_first_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexPairList().value_known_statically(i) && + (IndexPairList().operator[](i).first == value); + } +}; + +template +struct index_pair_first_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexPairList().value_known_statically(i) && + (IndexPairList().operator[](i).first == value); + } +}; + +template +struct index_pair_second_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; 
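+// Illustrative sketch (assumes the type2index helper defined earlier in this
+// header): entries declared with compile-time index types are visible to the
+// *_statically_* predicates above, while runtime entries are not.
+//   Eigen::IndexList<Eigen::type2index<0>, Eigen::Index> dims;
+//   dims.set(1, 5);                            // index 0 is fixed at 0; index 1 is runtime
+//   bool s0 = dims.value_known_statically(0);  // true
+//   bool s1 = dims.value_known_statically(1);  // false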
} +}; + +template +struct index_pair_second_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexPairList().value_known_statically(i) && + (IndexPairList().operator[](i).second == value); + } +}; + +template +struct index_pair_second_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexPairList().value_known_statically(i) && + (IndexPairList().operator[](i).second == value); + } +}; + +} // end namespace internal +} // end namespace Eigen + +namespace Eigen { +namespace internal { +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) { + return index_known_statically_impl::run(i); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() { + return all_indices_known_statically_impl::run(); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() { + return indices_statically_known_to_increase_impl::run(); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) { + return index_statically_eq_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) { + return index_statically_ne_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) { + return index_statically_gt_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) { + return index_statically_lt_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) { + return index_pair_first_statically_eq_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) { + return index_pair_second_statically_eq_impl::run(i, value); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h new file mode 100644 index 00000000000..5b0fca8b66a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -0,0 +1,226 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Ke Yang +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorInflation + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor inflation class. 
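+ * Inflation inserts stride - 1 zero-valued holes between adjacent
+ * coefficients along each dimension, so a dimension of size d becomes
+ * (d - 1) * stride + 1. As an illustrative sketch, inflating the vector
+ * {1, 2, 3} with a stride of 2 yields {1, 0, 2, 0, 3}.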
+ * + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorInflationOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorInflationOp type; +}; + +} // end namespace internal + +template +class TensorInflationOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides) + : m_xpr(expr), m_strides(strides) {} + + EIGEN_DEVICE_FUNC const Strides& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Strides m_strides; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorInflationOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_strides(op.strides()) { + m_dimensions = m_impl.dimensions(); + // Expand each dimension to the inflated dimension. + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1; + } + + // Remember the strides for fast division. 
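+ // The cached TensorIntDivisor (see TensorIntDiv.h) turns each division by
+ // a stride in getInputIndex() below into a multiply-and-shift, which is
+ // much cheaper than hardware integer division on most targets.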
+ for (int i = 0; i < NumDims; ++i) { + m_fastStrides[i] = internal::TensorIntDivisor(m_strides[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + } else { // RowMajor + m_outputStrides[NumDims - 1] = 1; + m_inputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + // Computes the input index given the output index. Returns true if the output + // index doesn't fall into a hole. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const { + eigen_assert(index < dimensions().TotalSize()); + *inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (idx != idx / m_fastStrides[i] * m_strides[i]) { + return false; + } + *inputIndex += idx / m_strides[i] * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index != index / m_fastStrides[0] * m_strides[0]) { + return false; + } + *inputIndex += index / m_strides[0]; + return true; + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (idx != idx / m_fastStrides[i] * m_strides[i]) { + return false; + } + *inputIndex += idx / m_strides[i] * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index != index / m_fastStrides[NumDims - 1] * m_strides[NumDims - 1]) { + return false; + } + *inputIndex += index / m_strides[NumDims - 1]; + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + Index inputIndex = 0; + if (getInputIndex(index, &inputIndex)) { + return m_impl.coeff(inputIndex); + } else { + return Scalar(0); + } + } + + // TODO(yangke): optimize this function so that we can detect and produce + // all-zero packets + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (3 * TensorOpCost::DivCost() + 3 * TensorOpCost::MulCost() + + 2 * TensorOpCost::AddCost()); + const double input_size = m_impl.dimensions().TotalSize(); + const double output_size = m_dimensions.TotalSize(); + if (output_size == 0) return TensorOpCost(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(sizeof(CoeffReturnType) * input_size / 
output_size, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + const Strides m_strides; + array, NumDims> m_fastStrides; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h new file mode 100644 index 00000000000..85cf492a001 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -0,0 +1,78 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H +#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H + +#include + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorInitializer + * \ingroup CXX11_Tensor_Module + * + * \brief Helper template to initialize Tensors from std::initializer_lists. + */ +namespace internal { + +template +struct Initializer { + typedef std::initializer_list::InitList> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + for (const auto& v : vals) { + (*indices)[traits::NumDimensions - N] = i++; + Initializer::run(tensor, indices, v); + } + } +}; + +template +struct Initializer { + typedef std::initializer_list::Scalar> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + // There is likely a faster way to do that than iterating. + for (const auto& v : vals) { + (*indices)[traits::NumDimensions - 1] = i++; + tensor.coeffRef(*indices) = v; + } + } +}; + +template +struct Initializer { + typedef typename traits::Scalar InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>*, const InitList& v) { + tensor.coeffRef(0) = v; + } +}; + +template +void initialize_tensor(TensorEvaluator& tensor, + const typename Initializer::NumDimensions>::InitList& vals) { + Eigen::array::Index, traits::NumDimensions> indices; + Initializer::NumDimensions>::run(tensor, &indices, vals); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h new file mode 100644 index 00000000000..fddc64822d0 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -0,0 +1,261 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H +#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \internal + * + * \class TensorIntDiv + * \ingroup CXX11_Tensor_Module + * + * \brief Fast integer division by a constant. + * + * See the paper from Granlund and Montgomery for explanation. + * (at https://doi.org/10.1145/773473.178249) + * + * \sa Tensor + */ + +namespace internal { + +// Note: result is undefined if val == 0 +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t count_leading_zeros(const T val) { +#ifdef EIGEN_GPU_COMPILE_PHASE + return __clz(val); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::clz(val); +#elif EIGEN_COMP_MSVC + unsigned long index; + _BitScanReverse(&index, val); + return 31 - index; +#else + EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); + return __builtin_clz(static_cast(val)); +#endif +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t count_leading_zeros(const T val) { +#ifdef EIGEN_GPU_COMPILE_PHASE + return __clzll(val); +#elif defined(SYCL_DEVICE_ONLY) + return static_cast(cl::sycl::clz(val)); +#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 + unsigned long index; + _BitScanReverse64(&index, val); + return 63 - index; +#elif EIGEN_COMP_MSVC + // MSVC's _BitScanReverse64 is not available for 32bits builds. + unsigned int lo = (unsigned int)(val & 0xffffffff); + unsigned int hi = (unsigned int)((val >> 32) & 0xffffffff); + int n; + if (hi == 0) + n = 32 + count_leading_zeros(lo); + else + n = count_leading_zeros(hi); + return n; +#else + EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); + return __builtin_clzll(static_cast(val)); +#endif +} + +template +struct UnsignedTraits { + typedef std::conditional_t type; +}; + +template +struct DividerTraits { + typedef typename UnsignedTraits::type type; + static constexpr int N = sizeof(T) * 8; +}; + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __umulhi(a, b); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::mul_hi(a, static_cast(b)); +#else + return (static_cast(a) * b) >> 32; +#endif +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __umul64hi(a, b); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::mul_hi(a, static_cast(b)); +#elif EIGEN_COMP_MSVC && (EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64) + return __umulh(a, static_cast(b)); +#elif EIGEN_HAS_BUILTIN_INT128 + __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); + return static_cast(v >> 64); +#else + return (TensorUInt128, uint64_t>(a) * TensorUInt128, uint64_t>(b)).upper(); +#endif +} + +template +struct DividerHelper { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) { + EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE); + return static_cast((static_cast(1) << (N + log_div)) / divider - + (static_cast(1) << N) + 1); + } +}; + +template +struct DividerHelper<64, T> { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { +#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) + 
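+ // With a native 128-bit integer type the multiplier
+ // (1 << (64 + log_div)) / divider - (1 << 64) + 1 can be computed
+ // directly; the #else branch below emulates the same computation with
+ // TensorUInt128.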
return static_cast((static_cast<__uint128_t>(1) << (64 + log_div)) / static_cast<__uint128_t>(divider) - + (static_cast<__uint128_t>(1) << 64) + 1); +#else + const uint64_t shift = 1ULL << log_div; + TensorUInt128 result = + TensorUInt128 >(shift, 0) / TensorUInt128, uint64_t>(divider) - + TensorUInt128, static_val<0> >(1, 0) + TensorUInt128, static_val<1> >(1); + return static_cast(result); +#endif + } +}; + +template +struct TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + multiplier = 0; + shift1 = 0; + shift2 = 0; + } + + // Must have 0 < divider < 2^31. This is relaxed to + // 0 < divider < 2^63 when using 64-bit indices on platforms that support + // the __uint128_t type. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { + const int N = DividerTraits::N; + eigen_assert(static_cast::type>(divider) < NumTraits::highest() / 2); + eigen_assert(divider > 0); + + // fast ln2 + const int leading_zeros = count_leading_zeros(static_cast(divider)); + int log_div = N - leading_zeros; + // if divider is a power of two then log_div is 1 more than it should be. + if ((static_cast::type>(1) << (log_div - 1)) == + static_cast::type>(divider)) + log_div--; + + multiplier = DividerHelper::computeMultiplier(log_div, divider); + shift1 = log_div > 1 ? 1 : log_div; + shift2 = log_div > 1 ? log_div - 1 : 0; + } + + // Must have 0 <= numerator. On platforms that don't support the __uint128_t + // type numerator should also be less than 2^32-1. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { + eigen_assert(static_cast::type>(numerator) < NumTraits::highest() / 2); + // eigen_assert(numerator >= 0); // this is implicitly asserted by the line above + + UnsignedType t1 = muluh(multiplier, numerator); + UnsignedType t = (static_cast(numerator) - t1) >> shift1; + return (t1 + t) >> shift2; + } + + private: + typedef typename DividerTraits::type UnsignedType; + UnsignedType multiplier; + int32_t shift1; + int32_t shift2; +}; + +// Optimized version for signed 32 bit integers. +// Derived from Hacker's Delight. +// Only works for divisors strictly greater than one +template <> +class TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + magic = 0; + shift = 0; + } + // Must have 2 <= divider + EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider) { + eigen_assert(divider >= 2); + calcMagic(divider); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { +#ifdef EIGEN_GPU_COMPILE_PHASE + return (__umulhi(magic, n) >> shift); +#elif defined(SYCL_DEVICE_ONLY) + return (cl::sycl::mul_hi(magic, static_cast(n)) >> shift); +#else + uint64_t v = static_cast(magic) * static_cast(n); + return (static_cast(v >> 32) >> shift); +#endif + } + + private: + // Compute the magic numbers. See Hacker's Delight section 10 for an in + // depth explanation. + EIGEN_DEVICE_FUNC void calcMagic(int32_t d) { + const unsigned two31 = 0x80000000; // 2**31. + unsigned ad = d; + unsigned t = two31 + (ad >> 31); + unsigned anc = t - 1 - t % ad; // Absolute value of nc. + int p = 31; // Init. p. + unsigned q1 = two31 / anc; // Init. q1 = 2**p/|nc|. + unsigned r1 = two31 - q1 * anc; // Init. r1 = rem(2**p, |nc|). + unsigned q2 = two31 / ad; // Init. q2 = 2**p/|d|. + unsigned r2 = two31 - q2 * ad; // Init. r2 = rem(2**p, |d|). + unsigned delta = 0; + do { + p = p + 1; + q1 = 2 * q1; // Update q1 = 2**p/|nc|. + r1 = 2 * r1; // Update r1 = rem(2**p, |nc|). 
+ if (r1 >= anc) { // (Must be an unsigned + q1 = q1 + 1; // comparison here). + r1 = r1 - anc; + } + q2 = 2 * q2; // Update q2 = 2**p/|d|. + r2 = 2 * r2; // Update r2 = rem(2**p, |d|). + if (r2 >= ad) { // (Must be an unsigned + q2 = q2 + 1; // comparison here). + r2 = r2 - ad; + } + delta = ad - r2; + } while (q1 < delta || (q1 == delta && r1 == 0)); + + magic = (unsigned)(q2 + 1); + shift = p - 32; + } + + uint32_t magic; + int32_t shift; +}; + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator/(const T& numerator, const TensorIntDivisor& divisor) { + return divisor.divide(numerator); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h new file mode 100644 index 00000000000..43f4bcb4656 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -0,0 +1,185 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorLayoutSwap + * \ingroup CXX11_Tensor_Module + * + * \brief Swap the layout from col-major to row-major, or row-major + * to col-major, and invert the order of the dimensions. + * + * Beware: the dimensions are reversed by this operation. If you want to + * preserve the ordering of the dimensions, you need to combine this + * operation with a shuffle. + * + * \example: + * Tensor input(2, 4); + * Tensor output = input.swap_layout(); + * eigen_assert(output.dimension(0) == 4); + * eigen_assert(output.dimension(1) == 2); + * + * array shuffle(1, 0); + * output = input.swap_layout().shuffle(shuffle); + * eigen_assert(output.dimension(0) == 2); + * eigen_assert(output.dimension(1) == 4); + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = (traits::Layout == ColMajor) ? 
RowMajor : ColMajor; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorLayoutSwapOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorLayoutSwapOp type; +}; + +} // end namespace internal + +template +class TensorLayoutSwapOp : public TensorBase, WriteAccessors> { + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorLayoutSwapOp) + protected: + typename XprType::Nested m_xpr; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorLayoutSwapOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + + static constexpr int Layout = + (TensorEvaluator::Layout == static_cast(ColMajor)) ? RowMajor : ColMajor; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = m_impl.dimensions()[NumDims - 1 - i]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_impl.coeff(index); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return m_impl.template packet(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return constCast(m_impl.data()); } + + const TensorEvaluator& impl() const { return m_impl; } + + protected: + TensorEvaluator m_impl; + Dimensions m_dimensions; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorLayoutSwapOp XprType; + + static constexpr int Layout = + (TensorEvaluator::Layout == 
static_cast(ColMajor)) ? RowMajor : ColMajor; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false // to be implemented + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + return this->m_impl.coeffRef(index); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + this->m_impl.template writePacket(index, x); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h new file mode 100644 index 00000000000..f8bbcfee74a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -0,0 +1,85 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
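+// Illustrative sketch for the TensorLayoutSwapOp above: swap_layout()
+// reverses both the storage order and the dimension order, so combine it
+// with shuffle() when the logical dimension order must be preserved.
+//   Eigen::Tensor<float, 2, Eigen::ColMajor> input(2, 4);
+//   Eigen::array<int, 2> shuffle{{1, 0}};
+//   Eigen::Tensor<float, 2, Eigen::RowMajor> output =
+//       input.swap_layout().shuffle(shuffle);
+//   // output.dimension(0) == 2, output.dimension(1) == 4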
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H +#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H + +/** use this macro in sfinae selection in templated functions + * + * template::value , int > = 0 + * > + * void foo(){} + * + * becomes => + * + * template::value ) + * > + * void foo(){} + */ + +#define EIGEN_SFINAE_ENABLE_IF(__condition__) std::enable_if_t<(__condition__), int> = 0 + +// Define a macro to use a reference on the host but a value on the device +#if defined(SYCL_DEVICE_ONLY) +#define EIGEN_DEVICE_REF +#else +#define EIGEN_DEVICE_REF & +#endif + +// Define a macro for catching SYCL exceptions if exceptions are enabled +#define EIGEN_SYCL_TRY_CATCH(X) \ + do { \ + EIGEN_TRY { X; } \ + EIGEN_CATCH(const cl::sycl::exception& e) { \ + EIGEN_THROW_X(std::runtime_error("SYCL exception at " + std::string(__FILE__) + ":" + std::to_string(__LINE__) + \ + "\n" + e.what())); \ + } \ + } while (false) + +// Define a macro if local memory flags are unset or one of them is set +// Setting both flags is the same as unsetting them +#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \ + (defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)) +#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 +#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 +#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) +#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 +#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) +#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 +#endif + +#if EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) +#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + using Base::operator=; \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { \ + Base::operator=(other); \ + return *this; \ + } \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) { \ + Base::operator=(other); \ + return *this; \ + } +#else +#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) +#endif + +/** \internal + * \brief Macro to manually inherit assignment operators. + * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is + * defined. This also inherits template operator=(const OtherDerived&) assignments. With C++11 or later + * this also default-implements the copy-constructor + */ +#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(Derived) \ + EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived) + +#endif diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h new file mode 100644 index 00000000000..191413b7d03 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -0,0 +1,191 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
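+// Illustrative sketch of the TensorMap declared below: it adapts an existing
+// buffer as a tensor expression without taking ownership or copying.
+//   float data[12] = {};
+//   Eigen::TensorMap<Eigen::Tensor<float, 2>> map(data, 3, 4);
+//   map(1, 2) = 5.0f;   // writes through to the underlying buffer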
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_) + +/** \class TensorMap + * \ingroup CXX11_Tensor_Module + * + * \brief A tensor expression mapping an existing array of data. + * + */ +/// `template class MakePointer_` is added to convert the host pointer to the device pointer. +/// It is added due to the fact that for our device compiler `T*` is not allowed. +/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`. +/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_` is `T*` . +/// Therefore, by adding the default value, we managed to convert the type and it does not break any +/// existing code as its default value is `T*`. +template class MakePointer_> +class TensorMap : public TensorBase > { + public: + typedef TensorMap Self; + typedef TensorBase > Base; +#ifdef EIGEN_USE_SYCL + typedef std::remove_reference_t::type> Nested; +#else + typedef typename Eigen::internal::nested::type Nested; +#endif + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::traits::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename PlainObjectType::Base::CoeffReturnType CoeffReturnType; + + typedef typename MakePointer_::Type PointerType; + typedef typename MakePointer_::ConstType PointerConstType; + + // WARN: PointerType still can be a pointer to const (const Scalar*), for + // example in TensorMap> expression. This type of + // expression should be illegal, but adding this restriction is not possible + // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488). + typedef std::conditional_t::value), + PointerType, // use simple pointer in lvalue expressions + PointerConstType // use const pointer in rvalue expressions + > + StoragePointerType; + + // If TensorMap was constructed over rvalue expression (e.g. const Tensor), + // we should return a reference to const from operator() (and others), even + // if TensorMap itself is not const. + typedef std::conditional_t::value), Scalar&, const Scalar&> StorageRefType; + + static constexpr int Options = Options_; + + static constexpr Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + static constexpr int Layout = PlainObjectType::Layout; + enum { IsAligned = ((int(Options_) & Aligned) == Aligned), CoordAccess = true, RawAccess = true }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, + IndexTypes... otherDimensions) + : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), + YOU_MADE_A_PROGRAMMING_MISTAKE) + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, + const array& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor) + : m_data(tensor.data()), m_dimensions(tensor.dimensions()) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) const { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options & RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()() const { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + return m_data[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, + IndexTypes... otherIndices) const { + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); + if (PlainObjectType::Options & RowMajor) { + const Index index = + m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = + m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options & RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()() { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + return m_data[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(Index index) { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, + IndexTypes... 
otherIndices) { + static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, + "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); + const std::size_t NumDims = sizeof...(otherIndices) + 2; + if (PlainObjectType::Options & RowMajor) { + const Index index = + m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = + m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } + } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorMap) + + private: + StoragePointerType m_data; + Dimensions m_dimensions; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h new file mode 100644 index 00000000000..8c2bb2ef9e2 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -0,0 +1,292 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H +#define EIGEN_CXX11_TENSOR_TENSOR_META_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +template +struct Cond {}; + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& choose(Cond, const T1& first, const T2&) { + return first; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T2& choose(Cond, const T1&, const T2& second) { + return second; +} + +template +struct max_n_1 { + static const size_t size = n; +}; +template <> +struct max_n_1<0> { + static const size_t size = 1; +}; + +template +EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T divup(const T x, const T y) { + return Eigen::numext::div_ceil(x, y); +} + +// Default packet types +template +struct PacketType : internal::packet_traits { + typedef typename internal::packet_traits::type type; +}; + +// For CUDA packet types when using a GpuDevice +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE) + +typedef ulonglong2 Packet4h2; +template <> +struct PacketType { + typedef Packet4h2 type; + static const int size = 8; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasExpm1 = 0, + HasLog = 1, + HasLog1p = 0, + HasLog10 = 0, + HasPow = 1, + }; +}; +#endif + +#if defined(EIGEN_USE_SYCL) + +namespace TensorSycl { +namespace internal { + +template +struct PlusOp { + static constexpr Index Value = A + B; +}; + +template +struct DivOp { + static constexpr Index Value = A / B; +}; + +template class StepOp> +struct static_for { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) { + op(start); + static_for::Value, end, step, StepOp>::loop(op); + } +}; +template class StepOp> 
+struct static_for { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {} +}; + +template +struct Vectorise { + static constexpr int PacketSize = 1; + typedef OutScalar PacketReturnType; +}; + +template +struct Vectorise { + static constexpr int PacketSize = Eigen::PacketType::size; + typedef typename Eigen::PacketType::type PacketReturnType; +}; + +static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) { return ((((x) + (y)-1) / (y)) * (y)); } + +} // namespace internal +} // namespace TensorSycl + +template <> +struct PacketType { + typedef half type; + static const int size = 1; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasArg = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0 + }; +}; +template +struct PacketType : internal::default_packet_traits { + typedef Scalar type; + typedef Scalar half; + enum { + Vectorizable = 0, + size = 1, + AlignedOnScalar = 0, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0 + }; +}; + +template +struct PacketType : PacketType {}; + +#ifndef EIGEN_DONT_VECTORIZE_SYCL +#define PACKET_TYPE(CVQual, Type, val, lengths, DEV) \ + template <> \ + struct PacketType : internal::sycl_packet_traits { \ + typedef typename internal::packet_traits::type type; \ + typedef typename internal::packet_traits::half half; \ + }; + +PACKET_TYPE(const, float, 1, 4, SyclDevice) +PACKET_TYPE(, float, 1, 4, SyclDevice) +PACKET_TYPE(const, float, 1, 4, const SyclDevice) +PACKET_TYPE(, float, 1, 4, const SyclDevice) + +PACKET_TYPE(const, double, 0, 2, SyclDevice) +PACKET_TYPE(, double, 0, 2, SyclDevice) +PACKET_TYPE(const, double, 0, 2, const SyclDevice) +PACKET_TYPE(, double, 0, 2, const SyclDevice) +#undef PACKET_TYPE + +template <> +struct PacketType : PacketType {}; +template <> +struct PacketType : PacketType {}; +#endif +#endif + +// Pair mimics std::pair but works on e.g. nvcc. 
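+// (std::pair is not reliably usable in device code on e.g. nvcc, hence this
+// trivially device-compatible replacement.) Illustrative usage:
+//   Eigen::Pair<int, float> p(1, 2.0f);
+//   int key = p.first;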
+template +struct Pair { + public: + EIGEN_MAKE_ALIGNED_OPERATOR_NEW + + U first; + V second; + + typedef U first_type; + typedef V second_type; + + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair() : first(), second() {} + + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair(const U& f, const V& s) : first(f), second(s) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(Pair& rhs) { + using numext::swap; + swap(first, rhs.first); + swap(second, rhs.second); + } +}; + +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const Pair& x, const Pair& y) { + return (x.first == y.first && x.second == y.second); +} + +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const Pair& x, const Pair& y) { + return !(x == y); +} + +// Can't use std::pairs on cuda devices +template +struct IndexPair { + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {} + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {} + + EIGEN_DEVICE_FUNC void set(IndexPair val) { + first = val.first; + second = val.second; + } + + Idx first; + Idx second; +}; + +namespace internal { + +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array( + IndexType& idx, numeric_list) { + return {static_cast(idx[First]), static_cast(idx[Is])...}; +} +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array(IndexType&, + numeric_list) { + return array(); +} + +/** Make an array (for index/dimensions) out of a custom index */ +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array(IndexType& idx) { + return customIndices2Array(idx, typename gen_numeric_list::type{}); +} + +template +struct is_base_of { + typedef char (&yes)[1]; + typedef char (&no)[2]; + + template + struct Host { + operator BB*() const; + operator DD*(); + }; + + template + static yes check(D*, T); + static no check(B*, int); + + static const bool value = sizeof(check(Host(), int())) == sizeof(yes); +}; + +} // namespace internal + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h new file mode 100644 index 00000000000..d790f35bb9c --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -0,0 +1,984 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H +#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. 
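+ * Reshaping reinterprets the coefficients of an expression under a new set
+ * of dimensions; the total number of coefficients must stay the same. As an
+ * illustrative sketch, a 2x6 tensor t can be viewed as 3x4 via
+ * t.reshape(Eigen::array<Eigen::Index, 2>{{3, 4}}).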
+ * + * + */ +namespace internal { +template +struct traits> : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = array_size::value; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorReshapingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, + typename eval>::type> { + typedef TensorReshapingOp type; +}; + +} // end namespace internal + +template +class TensorReshapingOp : public TensorBase, WriteAccessors> { + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC const NewDimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReshapingOp) + + protected: + typename XprType::Nested m_xpr; + const NewDimensions m_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory, Device> ConstCastStorage; + + static constexpr int NumOutputDims = internal::array_size::value; + static constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + + enum ReshapingKind { + // We do not use layout information to determine reshaping kind. + // Depending on the layout `N` can be inner or outer dimension. + OneByN = 0, // expr.reshape(1, N) + NByOne = 1, // expr.reshape(N, 1) + Runtime = 2 // Reshape dimensions are dynamic (specified at runtime). + }; + + // clang-format off + static const ReshapingKind kind = + (NumOutputDims == 2 && internal::index_statically_eq(/*index=*/0, /*value=*/1)) ? OneByN + : (NumOutputDims == 2 && internal::index_statically_eq(/*index=*/1, /*value=*/1)) ? NByOne + : Runtime; + // clang-format on + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + // For trivial reshapes with raw access to underlying data we will provide + // zero overhead block access. + // TODO(ezhulenev): Consider adding block access without raw access? 
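+ // Raw access means the evaluated input is a plain contiguous buffer, so a
+ // reshaped block can often be exposed as a zero-copy view at a new offset
+ // (see the OneByN/NByOne fast paths in block() below).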
+    BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess && NumInputDims > 0 && NumOutputDims > 0,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+  };
+
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumOutputDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumOutputDims, Layout, Index> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_dimensions(op.dimensions()) {
+    // The total size of the reshaped tensor must be equal to the total size
+    // of the input tensor.
+    eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(data, std::move(done));
+  }
+#endif
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); }
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_impl.coeff(index); }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return m_impl.template packet<LoadMode>(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return m_impl.costPerCoeff(vectorized);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
+
+  // required in block(OutputTensorBlock* output_block) const
+  // For C++03 compatibility this must be defined outside the method
+  struct BlockIteratorState {
+    Index stride;
+    Index span;
+    Index size;
+    Index count;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    eigen_assert(m_impl.data() != NULL);
+    eigen_assert((kind == Runtime) || (kind == OneByN && desc.dimensions()[0] == 1) ||
+                 (kind == NByOne && desc.dimensions()[1] == 1));
+
+    if (kind == OneByN || kind == NByOne) {
+      // We can guarantee at compile time that block is just a contiguous slice
+      // of the underlying expression memory buffer.
+      return TensorBlock(internal::TensorBlockKind::kView, m_impl.data() + desc.offset(), desc.dimensions());
+    } else {
+      // This will do additional runtime checks, and in the end it might also
+      // be a view, or it might be a block materialized in the temporary buffer.
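+      // (Descriptive note, not from the upstream sources: materialize() first
+      // checks whether the requested block is contiguous in the input buffer,
+      // and only copies coefficients into the scratch allocator when it is
+      // not.)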
+ return TensorBlock::materialize(m_impl.data(), m_dimensions, desc, scratch); + } + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return constCast(m_impl.data()); } + + EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } + + protected: + TensorEvaluator m_impl; + NewDimensions m_dimensions; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> + +{ + typedef TensorEvaluator, Device> Base; + typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess + }; + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + return this->m_impl.coeffRef(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + this->m_impl.template writePacket(index, x); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) { + eigen_assert(this->m_impl.data() != NULL); + + typedef typename TensorBlock::XprType TensorBlockExpr; + typedef internal::TensorBlockAssignment + TensorBlockAssign; + + TensorBlockAssign::Run(TensorBlockAssign::target(desc.dimensions(), internal::strides(this->dimensions()), + this->m_impl.data(), desc.offset()), + block.expr()); + } +}; + +/** \class TensorSlicing + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor slicing class. 
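+ *
+ * Extracts a sub-tensor given per-dimension start indices and extents. A
+ * minimal usage sketch (illustrative only, not part of the upstream sources):
+ * \code
+ * Eigen::Tensor<float, 2> input(4, 6);
+ * input.setRandom();
+ * Eigen::array<Eigen::Index, 2> offsets = {1, 2};
+ * Eigen::array<Eigen::Index, 2> extents = {2, 3};
+ * // 2x3 block whose top-left corner sits at (1, 2).
+ * Eigen::Tensor<float, 2> slice = input.slice(offsets, extents);
+ * \endcode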
+ *
+ *
+ */
+namespace internal {
+template <typename StartIndices, typename Sizes, typename XprType>
+struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = array_size<StartIndices>::value;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <typename StartIndices, typename Sizes, typename XprType>
+struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense> {
+  typedef const TensorSlicingOp<StartIndices, Sizes, XprType> EIGEN_DEVICE_REF type;
+};
+
+template <typename StartIndices, typename Sizes, typename XprType>
+struct nested<TensorSlicingOp<StartIndices, Sizes, XprType>, 1,
+              typename eval<TensorSlicingOp<StartIndices, Sizes, XprType> >::type> {
+  typedef TensorSlicingOp<StartIndices, Sizes, XprType> type;
+};
+
+}  // end namespace internal
+
+template <typename StartIndices, typename Sizes, typename XprType>
+class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> > {
+ public:
+  typedef TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> > Base;
+  typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorSlicingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices,
+                                                        const Sizes& sizes)
+      : m_xpr(expr), m_indices(indices), m_sizes(sizes) {}
+
+  EIGEN_DEVICE_FUNC const StartIndices& startIndices() const { return m_indices; }
+  EIGEN_DEVICE_FUNC const Sizes& sizes() const { return m_sizes; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+  EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorSlicingOp)
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const StartIndices m_indices;
+  const Sizes m_sizes;
+};
+
+namespace internal {
+
+// FIXME: figure out the exact threshold
+template <typename Index, typename Device, bool BlockAccess>
+struct MemcpyTriggerForSlicing {
+  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) {}
+  EIGEN_DEVICE_FUNC bool operator()(Index total, Index contiguous) const {
+    const bool prefer_block_evaluation = BlockAccess && total > 32 * 1024;
+    return !prefer_block_evaluation && contiguous > threshold_;
+  }
+
+ private:
+  Index threshold_;
+};
+
+// It is very expensive to start the memcpy kernel on GPU: we therefore only
+// use it for large copies.
+#ifdef EIGEN_USE_GPU
+template <typename Index, bool BlockAccess>
+struct MemcpyTriggerForSlicing<Index, GpuDevice, BlockAccess> {
+  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) {}
+  EIGEN_DEVICE_FUNC bool operator()(Index, Index contiguous) const { return contiguous > 4 * 1024 * 1024; }
+};
+#endif
+
+// Starting the memcpy kernel is likewise expensive on SYCL devices: we
+// therefore only use it for large copies.
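+// (Illustrative example, not from the upstream sources: slicing the first
+// three columns of a ColMajor 1000x1000 tensor yields a single run of 3000
+// contiguous coefficients, comfortably above the default threshold of
+// 2 * numThreads; slicing three rows instead yields contiguous runs of
+// length 3, so the slice falls back to regular per-coefficient evaluation.)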
+#ifdef EIGEN_USE_SYCL +template +struct MemcpyTriggerForSlicing { + EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) {} + EIGEN_DEVICE_FUNC bool operator()(Index, Index contiguous) const { return contiguous > 4 * 1024 * 1024; } +}; +#endif + +} // namespace internal + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorSlicingOp XprType; + static constexpr int NumDims = internal::array_size::value; + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + typedef StorageMemory Storage; + typedef StorageMemory, Device> ConstCastStorage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets and sizes. + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess && + // FIXME: Temporary workaround for bug in slicing of bool tensors. + !internal::is_same, bool>::value, + PreferBlockAccess = true, + CoordAccess = false, + RawAccess = false + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + // Tensor slicing does not change the block type. + typedef typename TensorEvaluator::TensorBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { + m_is_identity = true; + for (int i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); + if (m_impl.dimensions()[i] != op.sizes()[i] || op.startIndices()[i] != 0) { + m_is_identity = false; + } + } + + // No strides for scalars. + if (NumDims == 0) return; + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const Sizes& output_dims = op.sizes(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + + // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * output_dims[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); + } + } else { + m_inputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + + // Don't initialize m_fastOutputStrides[NumDims-1] since it won't ever be accessed. + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * output_dims[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? 
m_outputStrides[i] : 1); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (!NumTraits>::RequireInitialization && data && m_impl.data()) { + Index contiguous_values = 1; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } + // Use memcpy if it's going to be faster than using the regular evaluation. + const internal::MemcpyTriggerForSlicing trigger(m_device); + if (trigger(internal::array_prod(dimensions()), contiguous_values)) { + EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); + for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { + Index offset = srcCoeff(i); + m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src + offset), + contiguous_values * sizeof(Scalar)); + } + return false; + } + } + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType /*data*/, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + const int packetSize = PacketType::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < internal::array_prod(dimensions())); + + if (m_is_identity) { + return m_impl.template packet(index); + } + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[NumDims - 1]); + inputIndices[1] += (indices[1] + m_offsets[NumDims - 1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[packetSize]; + values[0] = 
m_impl.coeff(inputIndices[0]); + values[packetSize - 1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP + for (int i = 1; i < packetSize - 1; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + internal::TensorBlockResourceRequirements::skewed(target_size), m_impl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset())); + TensorBlock block = m_impl.block(arg_desc, scratch); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); + return block; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); + if (result) { + Index offset = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i + 1; j < NumDims; ++j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i - 1; j >= 0; --j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } + return result + offset; + } + return NULL; + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[0]); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[NumDims - 1]); + } + return inputIndex; + } + + array m_outputStrides; + array, NumDims> m_fastOutputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + const Device EIGEN_DEVICE_REF m_device; + Dimensions m_dimensions; + bool m_is_identity; + const StartIndices m_offsets; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorSlicingOp XprType; + static constexpr int NumDims = internal::array_size::value; + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + 
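+    // (Descriptive note, not from the upstream sources: a slice can only be
+    // written through raw memory when it is rank-1 and the underlying
+    // expression itself exposes raw access, which is what the RawAccess flag
+    // below encodes.)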
IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + CoordAccess = false, + RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + if (this->m_is_identity) { + this->m_impl.template writePacket(index, x); + return; + } + + const int packetSize = PacketType::size; + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[0]); + inputIndices[1] += (indices[1] + this->m_offsets[0]); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[NumDims - 1]); + inputIndices[1] += (indices[1] + this->m_offsets[NumDims - 1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + this->m_impl.template writePacket(inputIndices[0], x); + } else { + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + internal::pstore(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize - 1]; + EIGEN_UNROLL_LOOP + for (int i = 1; i < packetSize - 1; ++i) { + this->coeffRef(index + i) = values[i]; + } + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) { + TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset())); + this->m_impl.writeBlock(arg_desc, block); + } +}; + +namespace internal { +template +struct traits> : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = array_size::value; + static constexpr int 
Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorStridingSlicingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, + typename eval>::type> { + typedef TensorStridingSlicingOp type; +}; + +} // end namespace internal + +template +class TensorStridingSlicingOp + : public TensorBase> { + public: + typedef TensorBase> Base; + typedef typename internal::traits::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp(const XprType& expr, const StartIndices& startIndices, + const StopIndices& stopIndices, const Strides& strides) + : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices), m_strides(strides) {} + + EIGEN_DEVICE_FUNC const StartIndices& startIndices() const { return m_startIndices; } + EIGEN_DEVICE_FUNC const StartIndices& stopIndices() const { return m_stopIndices; } + EIGEN_DEVICE_FUNC const StartIndices& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingSlicingOp) + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_startIndices; + const StopIndices m_stopIndices; + const Strides m_strides; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorStridingSlicingOp XprType; + static constexpr int NumDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef Strides Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets and sizes. 
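+    // (Illustrative note, not from the upstream sources: a strided slice
+    // with, say, start {1}, stop {6}, stride {2} touches coefficients 1, 3
+    // and 5, so neither alignment nor contiguous packet loads can be
+    // assumed, and the flags below disable them.)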
+ IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()) { + // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero + DSizes startIndicesClamped, stopIndicesClamped; + for (ptrdiff_t i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); + if (m_strides[i] > 0) { + startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); + stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); + } else { + /* implies m_strides[i] < 0 by assert */ + startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); + stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); + } + m_startIndices[i] = startIndicesClamped[i]; + } + + typedef typename TensorEvaluator::Dimensions InputDimensions; + const InputDimensions& input_dims = m_impl.dimensions(); + + // compute output tensor shape + m_is_identity = true; + for (int i = 0; i < NumDims; i++) { + Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; + if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) { + m_dimensions[i] = 0; + } else { + m_dimensions[i] = (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 1 : 0); + eigen_assert(m_dimensions[i] >= 0); + } + if (m_strides[i] != 1 || interval != m_impl.dimensions()[i]) { + m_is_identity = false; + } + } + + Strides output_dims = m_dimensions; + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = m_strides[0]; + m_offsets[0] = startIndicesClamped[0]; + Index previousDimProduct = 1; + for (int i = 1; i < NumDims; ++i) { + previousDimProduct *= input_dims[i - 1]; + m_inputStrides[i] = previousDimProduct * m_strides[i]; + m_offsets[i] = startIndicesClamped[i] * previousDimProduct; + } + + // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * output_dims[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); + } + } else { + m_inputStrides[NumDims - 1] = m_strides[NumDims - 1]; + m_offsets[NumDims - 1] = startIndicesClamped[NumDims - 1]; + Index previousDimProduct = 1; + for (int i = NumDims - 2; i >= 0; --i) { + previousDimProduct *= input_dims[i + 1]; + m_inputStrides[i] = previousDimProduct * m_strides[i]; + m_offsets[i] = startIndicesClamped[i] * previousDimProduct; + } + + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * output_dims[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? 
m_outputStrides[i] : 1); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i] + m_offsets[i]; + index -= idx * m_outputStrides[i]; + } + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i] + m_offsets[i]; + index -= idx * m_outputStrides[i]; + } + } + return inputIndex; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { +#ifndef SYCL_DEVICE_ONLY + return numext::maxi(min, numext::mini(max, value)); +#else + return cl::sycl::clamp(value, min, max); +#endif + } + + array m_outputStrides; + array, NumDims> m_fastOutputStrides; + array m_inputStrides; + bool m_is_identity; + TensorEvaluator m_impl; + const Device EIGEN_DEVICE_REF m_device; + DSizes m_startIndices; // clamped startIndices + DSizes m_dimensions; + DSizes m_offsets; // offset in a flattened shape + const Strides m_strides; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorStridingSlicingOp XprType; + static constexpr int NumDims = internal::array_size::value; + static constexpr int Layout = TensorEvaluator::Layout; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = TensorEvaluator::CoordAccess, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Strides Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h 
b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h new file mode 100644 index 00000000000..0e8f54a8052 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -0,0 +1,620 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorPadding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor padding class. + * At the moment only padding with a constant value is supported. + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorPaddingOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorPaddingOp type; +}; + +} // end namespace internal + +template +class TensorPaddingOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, + const Scalar padding_value) + : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC const PaddingDimensions& padding() const { return m_padding_dims; } + EIGEN_DEVICE_FUNC Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PaddingDimensions m_padding_dims; + const Scalar m_padding_value; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorPaddingOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = true, + CoordAccess = true, + RawAccess = false + }; + + typedef 
std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device) { + // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead + // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector + // of 1 element first and then pad. + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Compute dimensions + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] += m_padding[i].first + m_padding[i].second; + } + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + m_outputStrides[NumDims] = m_outputStrides[NumDims - 1] * m_dimensions[NumDims - 1]; + } else { + m_inputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + m_outputStrides[i + 1] = m_outputStrides[i + 2] * m_dimensions[i + 1]; + } + m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (isPaddingAtIndexForDim(idx, i)) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (isPaddingAtIndexForDim(index, 0)) { + return m_paddingValue; + } + inputIndex += (index - m_padding[0].first); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i + 1]; + if (isPaddingAtIndexForDim(idx, i)) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i + 1]; + } + if (isPaddingAtIndexForDim(index, NumDims - 1)) { + return m_paddingValue; + } + inputIndex += (index - m_padding[NumDims - 1].first); + } + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType 
packet(Index index) const {
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      return packetColMajor(index);
+    }
+    return packetRowMajor(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    TensorOpCost cost = m_impl.costPerCoeff(vectorized);
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < NumDims; ++i) updateCostPerDimension(cost, i, i == 0);
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 1; i >= 0; --i) updateCostPerDimension(cost, i, i == NumDims - 1);
+    }
+    return cost;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size), m_impl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    // If one of the dimensions is zero, return empty block view.
+    if (desc.size() == 0) {
+      return TensorBlock(internal::TensorBlockKind::kView, NULL, desc.dimensions());
+    }
+
+    static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
+    const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;
+
+    Index offset = desc.offset();
+
+    // Compute offsets in the output tensor corresponding to the desc.offset().
+    DSizes<Index, NumDims> output_offsets;
+    for (int i = NumDims - 1; i > 0; --i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      const int stride_dim = IsColMajor ? dim : dim + 1;
+      output_offsets[dim] = offset / m_outputStrides[stride_dim];
+      offset -= output_offsets[dim] * m_outputStrides[stride_dim];
+    }
+    output_offsets[inner_dim_idx] = offset;
+
+    // Offsets in the input corresponding to output offsets.
+    DSizes<Index, NumDims> input_offsets = output_offsets;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
+    }
+
+    // Compute offset in the input buffer (at this point it might be illegal
+    // and point outside of the input buffer, because we don't check for
+    // negative offsets; it will be autocorrected in the block iteration loop
+    // below).
+    Index input_offset = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offset += input_offsets[dim] * m_inputStrides[dim];
+    }
+
+    // Destination buffer and scratch buffer both indexed from 0 and have the
+    // same dimensions as the requested block (for destination buffer this
+    // property is guaranteed by `desc.destination()`).
+    Index output_offset = 0;
+    const DSizes<Index, NumDims> output_strides = internal::strides<Layout>(desc.dimensions());
+
+    // NOTE(ezhulenev): We initialize block iteration state for `NumDims - 1`
+    // dimensions, skipping innermost dimension. In theory it should be possible
+    // to squeeze matching innermost dimensions, however in practice that did
+    // not show any improvements in benchmarks. Also, in practice the first
+    // outer dimension usually has padding, which will prevent squeezing.
+
+    // Initialize output block iterator state. Dimensions in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims - 1> it;
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const int dim = IsColMajor ?
i + 1 : NumDims - i - 2; + it[i].count = 0; + it[i].size = desc.dimension(dim); + + it[i].input_stride = m_inputStrides[dim]; + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + it[i].output_stride = output_strides[dim]; + it[i].output_span = it[i].output_stride * (it[i].size - 1); + } + + const Index input_inner_dim_size = static_cast(m_impl.dimensions()[inner_dim_idx]); + + // Total output size. + const Index output_size = desc.size(); + + // We will fill inner dimension of this size in the output. It might be + // larger than the inner dimension in the input, so we might have to pad + // before/after we copy values from the input inner dimension. + const Index output_inner_dim_size = desc.dimension(inner_dim_idx); + + // How many values to fill with padding BEFORE reading from the input inner + // dimension. + const Index output_inner_pad_before_size = + input_offsets[inner_dim_idx] < 0 + ? numext::mini(numext::abs(input_offsets[inner_dim_idx]), output_inner_dim_size) + : 0; + + // How many values we can actually copy from the input inner dimension. + const Index output_inner_copy_size = numext::mini( + // Want to copy from input. + (output_inner_dim_size - output_inner_pad_before_size), + // Can copy from input. + numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] + output_inner_pad_before_size), Index(0))); + + eigen_assert(output_inner_copy_size >= 0); + + // How many values to fill with padding AFTER reading from the input inner + // dimension. + const Index output_inner_pad_after_size = + (output_inner_dim_size - output_inner_copy_size - output_inner_pad_before_size); + + // Sanity check, sum of all sizes must be equal to the output size. + eigen_assert(output_inner_dim_size == + (output_inner_pad_before_size + output_inner_copy_size + output_inner_pad_after_size)); + + // Keep track of current coordinates and padding in the output. + DSizes output_coord = output_offsets; + DSizes output_padded; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + } + + typedef internal::StridedLinearBufferCopy LinCopy; + + // Prepare storage for the materialized padding result. + const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + + // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a + // single logical inner dimension. + + // When possible we squeeze writes for the innermost (only if non-padded) + // dimension with the first padded dimension. This allows to reduce the + // number of calls to LinCopy and better utilize vector instructions. + const bool squeeze_writes = NumDims > 1 && + // inner dimension is not padded + (input_inner_dim_size == m_dimensions[inner_dim_idx]) && + // and equal to the block inner dimension + (input_inner_dim_size == output_inner_dim_size); + + const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1; + + // Maximum coordinate on a squeeze dimension that we can write to. + const Index squeeze_max_coord = + squeeze_writes ? numext::mini( + // max non-padded element in the input + static_cast(m_dimensions[squeeze_dim] - m_padding[squeeze_dim].second), + // max element in the output buffer + static_cast(output_offsets[squeeze_dim] + desc.dimension(squeeze_dim))) + : static_cast(0); + + // Iterate copying data from `m_impl.data()` to the output buffer. 
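+    // (Illustrative walk-through, not from the upstream sources: for a 1-D
+    // input of size 4 padded by (2, 3), the output inner dimension has size
+    // 9 = 2 (pad before) + 4 (copy from input) + 3 (pad after), which is the
+    // decomposition asserted above.)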
+ for (Index size = 0; size < output_size;) { + // Detect if we are in the padded region (exclude innermost dimension). + bool is_padded = false; + for (int j = 1; j < NumDims; ++j) { + const int dim = IsColMajor ? j : NumDims - j - 1; + is_padded = output_padded[dim]; + if (is_padded) break; + } + + if (is_padded) { + // Fill single innermost dimension with padding value. + size += output_inner_dim_size; + + LinCopy::template Run(typename LinCopy::Dst(output_offset, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_dim_size); + + } else if (squeeze_writes) { + // Squeeze multiple reads from innermost dimensions. + const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim]; + size += output_inner_dim_size * squeeze_num; + + // Copy `squeeze_num` inner dimensions from input to output. + LinCopy::template Run(typename LinCopy::Dst(output_offset, 1, block_storage.data()), + typename LinCopy::Src(input_offset, 1, m_impl.data()), + output_inner_dim_size * squeeze_num); + + // Update iteration state for only `squeeze_num - 1` processed inner + // dimensions, because we have another iteration state update at the end + // of the loop that will update iteration state for the last inner + // processed dimension. + it[0].count += (squeeze_num - 1); + input_offset += it[0].input_stride * (squeeze_num - 1); + output_offset += it[0].output_stride * (squeeze_num - 1); + output_coord[squeeze_dim] += (squeeze_num - 1); + + } else { + // Single read from innermost dimension. + size += output_inner_dim_size; + + { // Fill with padding before copying from input inner dimension. + const Index out = output_offset; + + LinCopy::template Run(typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_pad_before_size); + } + + { // Copy data from input inner dimension. + const Index out = output_offset + output_inner_pad_before_size; + const Index in = input_offset + output_inner_pad_before_size; + + eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL); + + LinCopy::template Run(typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(in, 1, m_impl.data()), + output_inner_copy_size); + } + + { // Fill with padding after copying from input inner dimension. + const Index out = output_offset + output_inner_pad_before_size + output_inner_copy_size; + + LinCopy::template Run(typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_pad_after_size); + } + } + + for (int j = 0; j < NumDims - 1; ++j) { + const int dim = IsColMajor ? 
j + 1 : NumDims - j - 2;
+
+        if (++it[j].count < it[j].size) {
+          input_offset += it[j].input_stride;
+          output_offset += it[j].output_stride;
+          output_coord[dim] += 1;
+          output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+          break;
+        }
+        it[j].count = 0;
+        input_offset -= it[j].input_span;
+        output_offset -= it[j].output_span;
+        output_coord[dim] -= it[j].size - 1;
+        output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
+
+ private:
+  struct BlockIteratorState {
+    BlockIteratorState() : count(0), size(0), input_stride(0), input_span(0), output_stride(0), output_span(0) {}
+
+    Index count;
+    Index size;
+    Index input_stride;
+    Index input_span;
+    Index output_stride;
+    Index output_span;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(Index index, int dim_index) const {
+    return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) &&
+            index < m_padding[dim_index].first) ||
+           (!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) &&
+            index >= m_dimensions[dim_index] - m_padding[dim_index].second);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(int dim_index) const {
+    return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(int dim_index) const {
+    return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0);
+  }
+
+  void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
+    const double in = static_cast<double>(m_impl.dimensions()[i]);
+    const double out = in + m_padding[i].first + m_padding[i].second;
+    if (out == 0) return;
+    const double reduction = in / out;
+    cost *= reduction;
+    if (first) {
+      cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() + reduction * (1 * TensorOpCost::AddCost<Index>()));
+    } else {
+      cost += TensorOpCost(0, 0,
+                           2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+                               reduction * (2 * TensorOpCost::MulCost<Index>() + 1 * TensorOpCost::DivCost<Index>()));
+    }
+  }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    const Index initialIndex = index;
+    Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index firstIdx = index;
+      const Index lastIdx = index + PacketSize - 1;
+      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
+      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
+      const Index lastPaddedRight = m_outputStrides[i + 1];
+
+      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
+        // all the coefficients are in the padding zone.
+        return internal::pset1<PacketReturnType>(m_paddingValue);
+      } else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+        // all the coefficients are in the padding zone.
+        return internal::pset1<PacketReturnType>(m_paddingValue);
+      } else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) ||
+                 (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+        // all the coefficients are between the 2 padding zones.
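+        // (Descriptive note, not from the upstream sources: the whole packet
+        // lies in the interior for this dimension, so the output coordinate
+        // is translated back to the input by subtracting the left padding.)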
+        const Index idx = index / m_outputStrides[i];
+        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      } else {
+        // Every other case
+        return packetWithPossibleZero(initialIndex);
+      }
+    }
+
+    const Index lastIdx = index + PacketSize - 1;
+    const Index firstIdx = index;
+    const Index lastPaddedLeft = m_padding[0].first;
+    const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
+    const Index lastPaddedRight = m_outputStrides[1];
+
+    if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) {
+      // all the coefficients are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    } else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+      // all the coefficients are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    } else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) ||
+               (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+      // all the coefficients are between the 2 padding zones.
+      inputIndex += (index - m_padding[0].first);
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
+    // Every other case
+    return packetWithPossibleZero(initialIndex);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    const Index initialIndex = index;
+    Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index firstIdx = index;
+      const Index lastIdx = index + PacketSize - 1;
+      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i + 1];
+      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i + 1];
+      const Index lastPaddedRight = m_outputStrides[i];
+
+      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
+        // all the coefficients are in the padding zone.
+        return internal::pset1<PacketReturnType>(m_paddingValue);
+      } else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+        // all the coefficients are in the padding zone.
+        return internal::pset1<PacketReturnType>(m_paddingValue);
+      } else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) ||
+                 (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+        // all the coefficients are between the 2 padding zones.
+        const Index idx = index / m_outputStrides[i + 1];
+        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i + 1];
+      } else {
+        // Every other case
+        return packetWithPossibleZero(initialIndex);
+      }
+    }
+
+    const Index lastIdx = index + PacketSize - 1;
+    const Index firstIdx = index;
+    const Index lastPaddedLeft = m_padding[NumDims - 1].first;
+    const Index firstPaddedRight = (m_dimensions[NumDims - 1] - m_padding[NumDims - 1].second);
+    const Index lastPaddedRight = m_outputStrides[NumDims - 1];
+
+    if (!isLeftPaddingCompileTimeZero(NumDims - 1) && lastIdx < lastPaddedLeft) {
+      // all the coefficients are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    } else if (!isRightPaddingCompileTimeZero(NumDims - 1) && firstIdx >= firstPaddedRight &&
+               lastIdx < lastPaddedRight) {
+      // all the coefficients are in the padding zone.
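+      // (Descriptive note, not from the upstream sources: every lane of the
+      // packet falls in the right padding region of the last dimension, so a
+      // broadcast of the padding value is returned without reading the input.)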
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    } else if ((isLeftPaddingCompileTimeZero(NumDims - 1) && isRightPaddingCompileTimeZero(NumDims - 1)) ||
+               (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+      // all the coefficients are between the 2 padding zones.
+      inputIndex += (index - m_padding[NumDims - 1].first);
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
+    // Every other case
+    return packetWithPossibleZero(initialIndex);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const {
+    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = coeff(index + i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  Dimensions m_dimensions;
+  array<Index, NumDims + 1> m_outputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+  PaddingDimensions m_padding;
+
+  Scalar m_paddingValue;
+
+  const Device EIGEN_DEVICE_REF m_device;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
new file mode 100644
index 00000000000..7b33f1fe19c
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -0,0 +1,260 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
+#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class TensorPatch
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor patch class.
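+ *
+ * Extracts every patch of a given shape from the input and adds one dimension
+ * that indexes the patches. A minimal usage sketch (illustrative only, not
+ * part of the upstream sources):
+ * \code
+ * Eigen::Tensor<float, 2> input(4, 5);
+ * input.setRandom();
+ * Eigen::array<Eigen::Index, 2> patch_dims{2, 2};
+ * // For ColMajor the result is 2x2x12: twelve 2x2 patches, since
+ * // (4 - 2 + 1) * (5 - 2 + 1) = 12.
+ * Eigen::Tensor<float, 3> patches = input.extract_patches(patch_dims);
+ * \endcode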
+ * + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions + 1; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorPatchOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorPatchOp type; +}; + +} // end namespace internal + +template +class TensorPatchOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims) + : m_xpr(expr), m_patch_dims(patch_dims) {} + + EIGEN_DEVICE_FUNC const PatchDim& patch_dims() const { return m_patch_dims; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PatchDim m_patch_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorPatchOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value + 1; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { + Index num_patches = 1; + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const PatchDim& patch_dims = op.patch_dims(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims - 1; ++i) { + m_dimensions[i] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[NumDims - 1] = num_patches; + + m_inputStrides[0] = 1; + m_patchStrides[0] = 1; + for (int i = 1; i < NumDims - 1; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + m_patchStrides[i] = m_patchStrides[i - 1] * (input_dims[i - 1] - patch_dims[i - 1] + 1); + } + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + 
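+        // (Descriptive note, not from the upstream sources: in RowMajor order
+        // the patch index becomes dimension 0, so every patch extent shifts
+        // up by one position.)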
m_dimensions[i + 1] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[0] = num_patches; + + m_inputStrides[NumDims - 2] = 1; + m_patchStrides[NumDims - 2] = 1; + for (int i = NumDims - 3; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + m_patchStrides[i] = m_patchStrides[i + 1] * (input_dims[i + 1] - patch_dims[i + 1] + 1); + } + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? NumDims - 1 : 0; + // Find the location of the first element of the patch. + Index patchIndex = index / m_outputStrides[output_stride_index]; + // Find the offset of the element wrt the location of the first element. + Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index]; + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i]; + patchOffset -= offsetIdx * m_outputStrides[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i + 1]; + patchOffset -= offsetIdx * m_outputStrides[i + 1]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } + inputIndex += (patchIndex + patchOffset); + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? 
NumDims - 1 : 0; + Index indices[2] = {index, index + PacketSize - 1}; + Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index], + indices[1] / m_outputStrides[output_stride_index]}; + Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index], + indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]}; + + Index inputIndices[2] = {0, 0}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i], patchOffsets[1] / m_outputStrides[i]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i + 1], patchOffsets[1] / m_outputStrides[i + 1]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i + 1]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i + 1]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + } + inputIndices[0] += (patchIndices[0] + patchOffsets[0]); + inputIndices[1] += (patchIndices[1] + patchOffsets[1]); + + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } else { + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[PacketSize - 1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize - 1; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (TensorOpCost::DivCost() + TensorOpCost::MulCost() + + 2 * TensorOpCost::AddCost()); + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + array m_patchStrides; + + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h new file mode 100644 index 00000000000..c9c613a1489 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -0,0 +1,309 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2016 Benoit Steiner +// Copyright (C) 2018 Mehdi Goli Codeplay Software Ltd. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H +#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t get_random_seed() { +#if defined(EIGEN_GPU_COMPILE_PHASE) + // We don't support 3d kernels since we currently only use 1 and + // 2d kernels. + gpu_assert(threadIdx.z == 0); + return blockIdx.x * blockDim.x + threadIdx.x + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); +#else + // Rely on Eigen's random implementation. + return random(); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { + // TODO: Unify with the implementation in the non blocking thread pool. + uint64_t current = *state; + // Update the internal state + *state = current * 6364136223846793005ULL + (stream << 1 | 1); + // Generate the random output (using the PCG-XSH-RS scheme) + return static_cast((current ^ (current >> 22)) >> (22 + (current >> 61))); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { + seed = seed ? seed : get_random_seed(); + return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeUniform(uint64_t* state, uint64_t stream) { + unsigned rnd = PCG_XSH_RS_generator(state, stream); + return static_cast(rnd); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half RandomToTypeUniform(uint64_t* state, uint64_t stream) { + // Generate 10 random bits for the mantissa, merge with exponent. + unsigned rnd = PCG_XSH_RS_generator(state, stream); + const uint16_t half_bits = static_cast(rnd & 0x3ffu) | (static_cast(15) << 10); + Eigen::half result = Eigen::numext::bit_cast(half_bits); + // Return the final result + return result - Eigen::half(1.0f); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 RandomToTypeUniform(uint64_t* state, + uint64_t stream) { + // Generate 7 random bits for the mantissa, merge with exponent. 
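+  // (For reference: bfloat16 packs 1 sign, 8 exponent, and 7 mantissa bits.
+  // Setting the exponent field to the bias (127) puts the encoded value in
+  // [1, 2), so subtracting 1 below yields a uniform sample in [0, 1).)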
+  unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
+  Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
+  // Return the final result
+  return result - Eigen::bfloat16(1.0f);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
+  typedef union {
+    uint32_t raw;
+    float fp;
+  } internal;
+  internal result;
+  // Generate 23 random bits for the mantissa
+  const unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  result.raw = rnd & 0x7fffffu;
+  // Set the exponent
+  result.raw |= (static_cast<uint32_t>(127) << 23);
+  // Return the final result
+  return result.fp - 1.0f;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
+  typedef union {
+    uint64_t raw;
+    double dp;
+  } internal;
+  internal result;
+  result.raw = 0;
+  // Generate 52 random bits for the mantissa
+  // First generate the upper 20 bits
+  unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
+  // Then generate the lower 32 bits
+  unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
+  result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
+  // Set the exponent
+  result.raw |= (static_cast<uint64_t>(1023) << 52);
+  // Return the final result
+  return result.dp - 1.0;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state,
+                                                                                                    uint64_t stream) {
+  return std::complex<float>(RandomToTypeUniform<float>(state, stream), RandomToTypeUniform<float>(state, stream));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state,
+                                                                                                      uint64_t stream) {
+  return std::complex<double>(RandomToTypeUniform<double>(state, stream), RandomToTypeUniform<double>(state, stream));
+}
+
+template <typename T>
+class UniformRandomGenerator {
+ public:
+  static constexpr bool PacketAccess = true;
+
+  // Uses the given "seed" if non-zero, otherwise uses a random seed.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(uint64_t seed = 0) {
+    m_state = PCG_XSH_RS_state(seed);
+#ifdef EIGEN_USE_SYCL
+    // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+    // Therefore, we need two steps to initialize m_state.
+    // In SYCL, the constructor of the functor is called on the CPU,
+    // and we get the clock seed here from the CPU. However, this seed is
+    // the same for all threads. Unlike CUDA, the thread ID, block ID, etc. are not
+    // global functions and are only available in operator() (which is called on the GPU).
+    // Thus for CUDA (((CLOCK + global_thread_id) * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each
+    // thread, but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread, and
+    // each thread adds (global_thread_id * 6364136223846793005ULL) for itself, only once, to complete the
+    // construction similarly to CUDA. Therefore, the thread-id injection is not available at this stage;
+    // however, when operator() is called the thread ID will be available. So inside the operator
+    // we add the thread ID, block ID, ... (which is the equivalent of i)
+    // to the seed and construct the unique m_state per thread, similarly to CUDA.
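+    // Schematically (illustrative pseudocode, not the upstream API; MUL and INC
+    // stand for the two PCG constants used above):
+    //   host (here):          m_state = CLOCK * MUL + INC;
+    //   device (operator()):  m_state += global_thread_id * MUL;  // applied once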
+    m_exec_once = false;
+#endif
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(const UniformRandomGenerator& other) {
+    m_state = other.m_state;
+#ifdef EIGEN_USE_SYCL
+    m_exec_once = other.m_exec_once;
+#endif
+  }
+
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const {
+#ifdef EIGEN_USE_SYCL
+    if (!m_exec_once) {
+      // This is the second stage of adding the thread ID to the CPU clock seed, building a unique seed per thread.
+      // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once = true;
+    }
+#endif
+    T result = RandomToTypeUniform<T>(&m_state, i);
+    return result;
+  }
+
+  template <typename Packet, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const {
+    const int packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_MAX T values[packetSize];
+#ifdef EIGEN_USE_SYCL
+    if (!m_exec_once) {
+      // This is the second stage of adding the thread ID to the CPU clock seed, building a unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once = true;
+    }
+#endif
+    EIGEN_UNROLL_LOOP
+    for (int j = 0; j < packetSize; ++j) {
+      values[j] = RandomToTypeUniform<T>(&m_state, i);
+    }
+    return internal::pload<Packet>(values);
+  }
+
+ private:
+  mutable uint64_t m_state;
+#ifdef EIGEN_USE_SYCL
+  mutable bool m_exec_once;
+#endif
+};
+
+template <typename Scalar>
+struct functor_traits<UniformRandomGenerator<Scalar> > {
+  enum {
+    // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
+    Cost = 12 * NumTraits<Scalar>::AddCost * ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
+    PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
+  };
+};
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
+  // Use the ratio-of-uniforms method to generate numbers following a normal
+  // distribution. See for example Numerical Recipes chapter 7.3.9 for the
+  // details.
+  T u, v, q;
+  do {
+    u = RandomToTypeUniform<T>(state, stream);
+    v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
+    const T x = u - T(0.449871);
+    const T y = numext::abs(v) + T(0.386595);
+    q = x * x + y * (T(0.196) * y - T(0.25472) * x);
+  } while (q > T(0.27597) && (q > T(0.27846) || v * v > T(-4) * numext::log(u) * u * u));
+
+  return v / u;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state,
+                                                                                                   uint64_t stream) {
+  return std::complex<float>(RandomToTypeNormal<float>(state, stream), RandomToTypeNormal<float>(state, stream));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state,
+                                                                                                     uint64_t stream) {
+  return std::complex<double>(RandomToTypeNormal<double>(state, stream), RandomToTypeNormal<double>(state, stream));
+}
+
+template <typename T>
+class NormalRandomGenerator {
+ public:
+  static constexpr bool PacketAccess = true;
+
+  // Uses the given "seed" if non-zero, otherwise uses a random seed.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
+    m_state = PCG_XSH_RS_state(seed);
+#ifdef EIGEN_USE_SYCL
+    // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+    // Therefore, we need two steps to initialize m_state.
+    // In SYCL, the constructor of the functor is called on the CPU,
+    // and we get the clock seed here from the CPU. However, this seed is
+    // the same for all threads. Unlike CUDA, the thread ID, block ID, etc. are not
+    // global functions and are only available in operator() (which is called on the GPU).
+    // Therefore, the thread-id injection is not available at this stage; however, when operator()
+    // is called the thread ID will be available. So inside the operator
+    // we add the thread ID, block ID, ... (which is the equivalent of i)
+    // to the seed and construct the unique m_state per thread, similarly to CUDA.
+    m_exec_once = false;
+#endif
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(const NormalRandomGenerator& other) {
+    m_state = other.m_state;
+#ifdef EIGEN_USE_SYCL
+    m_exec_once = other.m_exec_once;
+#endif
+  }
+
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const {
+#ifdef EIGEN_USE_SYCL
+    if (!m_exec_once) {
+      // This is the second stage of adding the thread ID to the CPU clock seed, building a unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once = true;
+    }
+#endif
+    T result = RandomToTypeNormal<T>(&m_state, i);
+    return result;
+  }
+
+  template <typename Packet, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const {
+    const int packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_MAX T values[packetSize];
+#ifdef EIGEN_USE_SYCL
+    if (!m_exec_once) {
+      // This is the second stage of adding the thread ID to the CPU clock seed, building a unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once = true;
+    }
+#endif
+    EIGEN_UNROLL_LOOP
+    for (int j = 0; j < packetSize; ++j) {
+      values[j] = RandomToTypeNormal<T>(&m_state, i);
+    }
+    return internal::pload<Packet>(values);
+  }
+
+ private:
+  mutable uint64_t m_state;
+#ifdef EIGEN_USE_SYCL
+  mutable bool m_exec_once;
+#endif
+};
+
+template <typename Scalar>
+struct functor_traits<NormalRandomGenerator<Scalar> > {
+  enum {
+    // On average, we need to generate about 3 random numbers
+    // 15 mul, 8 add, 1.5 logs
+    Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost + 15 * NumTraits<Scalar>::AddCost +
+           8 * NumTraits<Scalar>::AddCost + 3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
+    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
+  };
+};
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
new file mode 100644
index 00000000000..2ecbb7c209e
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -0,0 +1,1038 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
+
+// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
+// so we'll use a macro to make clang happy.
+#ifndef KERNEL_FRIEND
+#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
+#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
+#else
+#define KERNEL_FRIEND friend
+#endif
+#endif
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class TensorReduction
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor reduction class.
+ * + */ + +namespace internal { +template class MakePointer_> +struct traits > : traits { + typedef traits XprTraits; + typedef typename XprTraits::Scalar Scalar; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + static constexpr int NumDimensions = XprTraits::NumDimensions - array_size::value; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; + + template + struct MakePointer { + // Intermediate typedef to workaround MSVC issue. + typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; +}; + +template class MakePointer_> +struct eval, Eigen::Dense> { + typedef const TensorReductionOp& type; +}; + +template class MakePointer_> +struct nested, 1, + typename eval >::type> { + typedef TensorReductionOp type; +}; + +template +struct DimInitializer { + template + EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims, + const array::value>& reduced, + OutputDims* output_dims, ReducedDims* reduced_dims) { + const int NumInputDims = internal::array_size::value; + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (reduced[i]) { + (*reduced_dims)[reduceIndex] = input_dims[i]; + ++reduceIndex; + } else { + (*output_dims)[outputIndex] = input_dims[i]; + ++outputIndex; + } + } + } +}; + +template <> +struct DimInitializer > { + template + EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims, const array&, Sizes<>*, + array* reduced_dims) { + const int NumInputDims = internal::array_size::value; + for (int i = 0; i < NumInputDims; ++i) { + (*reduced_dims)[i] = input_dims[i]; + } + } +}; + +template +struct are_inner_most_dims { + static const bool value = false; +}; +template +struct preserve_inner_most_dims { + static const bool value = false; +}; + +template +struct are_inner_most_dims { + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_eq(0, 0); + static const bool tmp3 = + index_statically_eq(array_size::value - 1, array_size::value - 1); + static const bool value = tmp1 & tmp2 & tmp3; +}; +template +struct are_inner_most_dims { + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_eq(0, NumTensorDims - array_size::value); + static const bool tmp3 = index_statically_eq(array_size::value - 1, NumTensorDims - 1); + static const bool value = tmp1 & tmp2 & tmp3; +}; +template +struct preserve_inner_most_dims { + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_gt(0, 0); + static const bool value = tmp1 & tmp2; +}; +template +struct preserve_inner_most_dims { + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_lt(array_size::value - 1, NumTensorDims - 1); + static const bool value = tmp1 & tmp2; +}; + +template +struct GenericDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, + Op& reducer, typename Self::CoeffReturnType* accum) { + EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + GenericDimReducer::reduce(self, input, reducer, accum); + } + } +}; +template +struct GenericDimReducer<0, Self, Op> { + 
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, + Op& reducer, typename Self::CoeffReturnType* accum) { + for (int j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reduce(self.m_impl.coeff(input), accum); + } + } +}; +template +struct GenericDimReducer<-1, Self, Op> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, + typename Self::CoeffReturnType* accum) { + reducer.reduce(self.m_impl.coeff(index), accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce( + const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalize(accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce( + const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer0) { + using Index = typename Self::Index; + constexpr Index packetSize = internal::unpacket_traits::size; + Index start = 0; + typename Self::PacketReturnType paccum0 = reducer0.template initializePacket(); + if (!Self::ReducerTraits::IsStateful && numValuesToReduce >= 4 * packetSize) { + const Index VectorizedSize4 = (numValuesToReduce / (4 * packetSize)) * (4 * packetSize); + typename Self::PacketReturnType paccum1 = reducer0.template initializePacket(); + typename Self::PacketReturnType paccum2 = reducer0.template initializePacket(); + typename Self::PacketReturnType paccum3 = reducer0.template initializePacket(); + const Index offset0 = firstIndex; + const Index offset1 = firstIndex + packetSize; + const Index offset2 = firstIndex + 2 * packetSize; + const Index offset3 = firstIndex + 3 * packetSize; + for (Index j = 0; j < VectorizedSize4; j += 4 * packetSize) { + reducer0.reducePacket(self.m_impl.template packet(offset0 + j), &paccum0); + reducer0.reducePacket(self.m_impl.template packet(offset1 + j), &paccum1); + reducer0.reducePacket(self.m_impl.template packet(offset2 + j), &paccum2); + reducer0.reducePacket(self.m_impl.template packet(offset3 + j), &paccum3); + } + reducer0.reducePacket(paccum1, &paccum0); + reducer0.reducePacket(paccum2, &paccum0); + reducer0.reducePacket(paccum3, &paccum0); + start = VectorizedSize4; + } + if (start <= (numValuesToReduce - packetSize)) { + const Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; + for (Index j = start; j < VectorizedSize; j += packetSize) { + reducer0.reducePacket(self.m_impl.template packet(firstIndex + j), &paccum0); + } + start = VectorizedSize; + } + typename Self::CoeffReturnType accum = reducer0.initialize(); + for (Index j = start; j < numValuesToReduce; ++j) { + reducer0.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer0.finalizeBoth(accum, paccum0); + } +}; + +#if !defined(EIGEN_HIPCC) + +// The following implements tree-based reduction, which improves the accuracy +// of sum and mean reductions, since each of the n inputs only participates in +// O(log n) additions. 
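+// For intuition, a minimal standalone sketch of the same idea, assuming plain
+// float inputs and a sum reduction (the helper name and leaf size below are
+// illustrative, not upstream API):
+//
+//   float tree_sum(const float* data, Eigen::Index n, Eigen::Index leaf = 1024) {
+//     if (n <= leaf) {  // small range: plain sequential accumulation
+//       float acc = 0.0f;
+//       for (Eigen::Index i = 0; i < n; ++i) acc += data[i];
+//       return acc;
+//     }
+//     const Eigen::Index half = n / 2;  // split, reduce halves, combine once
+//     return tree_sum(data, half, leaf) + tree_sum(data + half, n - half, leaf);
+//   }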
+template +EIGEN_DEVICE_FUNC inline Index LeafSize() { + return 1024; +} +template <> +EIGEN_DEVICE_FUNC inline Index LeafSize() { + return 200; +} +template <> +EIGEN_DEVICE_FUNC inline Index LeafSize() { + return 128; +} + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce( + const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + const Index kLeafSize = LeafSize(); + typename Self::CoeffReturnType accum = reducer.initialize(); + if (numValuesToReduce > kLeafSize) { + const typename Self::Index half = numValuesToReduce / 2; + // Recursively reduce the two halves. + reducer.reduce(reduce(self, firstIndex, half, reducer), &accum); + reducer.reduce(reduce(self, firstIndex + half, numValuesToReduce - half, reducer), &accum); + return reducer.finalize(accum); + } else { + return InnerMostDimReducer::reduce(self, firstIndex, numValuesToReduce, reducer); + } + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce( + const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + const Index kLeafSize = LeafSize(); + const typename Self::Index packetSize = internal::unpacket_traits::size; + typename Self::CoeffReturnType accum = reducer.initialize(); + if (numValuesToReduce > packetSize * kLeafSize) { + // Make sure the split point is aligned on a packet boundary. + const typename Self::Index split = + packetSize * + numext::div_ceil(firstIndex + numext::div_ceil(numValuesToReduce, typename Self::Index(2)), packetSize); + const typename Self::Index num_left = numext::mini(split - firstIndex, numValuesToReduce); + reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum); + if (num_left < numValuesToReduce) { + reducer.reduce(reduce(self, split, numValuesToReduce - num_left, reducer), &accum); + } + return reducer.finalize(accum); + } else { + return InnerMostDimReducer::reduce(self, firstIndex, numValuesToReduce, reducer); + } + } +}; +#endif + +template +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, + typename Self::PacketReturnType*) { + eigen_assert(false && "should never be called"); + } +}; + +template +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, + Op& reducer, typename Self::PacketReturnType* accum) { + EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + InnerMostDimPreserver::reduce(self, input, reducer, accum); + } + } +}; + +template +struct InnerMostDimPreserver<0, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, + Op& reducer0, typename Self::PacketReturnType* accum0) { + using Index = typename Self::Index; + const Index stride = self.m_reducedStrides[0]; + const Index size = self.m_reducedDims[0]; + if (!Self::ReducerTraits::IsStateful && size >= 16) { + const Index unrolled_size4 = (size / 4) * 4; + typename Self::PacketReturnType accum1 = reducer0.template initializePacket(); + typename Self::PacketReturnType accum2 = reducer0.template initializePacket(); + typename Self::PacketReturnType 
accum3 = reducer0.template initializePacket(); + for (Index j = 0; j < unrolled_size4; j += 4) { + const Index input0 = firstIndex + j * stride; + reducer0.reducePacket(self.m_impl.template packet(input0), accum0); + const Index input1 = firstIndex + (j + 1) * stride; + reducer0.reducePacket(self.m_impl.template packet(input1), &accum1); + const Index input2 = firstIndex + (j + 2) * stride; + reducer0.reducePacket(self.m_impl.template packet(input2), &accum2); + const Index input3 = firstIndex + (j + 3) * stride; + reducer0.reducePacket(self.m_impl.template packet(input3), &accum3); + } + reducer0.reducePacket(accum1, accum0); + reducer0.reducePacket(accum2, accum0); + reducer0.reducePacket(accum3, accum0); + for (Index j = unrolled_size4; j < size; ++j) { + Index input = firstIndex + j * stride; + reducer0.reducePacket(self.m_impl.template packet(input), accum0); + } + } else { + for (Index j = 0; j < size; ++j) { + Index input = firstIndex + j * stride; + reducer0.reducePacket(self.m_impl.template packet(input), accum0); + } + } + } +}; +template +struct InnerMostDimPreserver<-1, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, + typename Self::PacketReturnType*) { + eigen_assert(false && "should never be called"); + } +}; + +// Default full reducer +template +struct FullReducer { + static constexpr bool HasOptimizedImplementation = false; + + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, + typename Self::EvaluatorPointerType output) { + const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); + *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + } +}; + +#ifdef EIGEN_USE_THREADS +// Multithreaded full reducers +template +struct FullReducerShard { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, + typename Self::Index numValuesToReduce, Op& reducer, + typename Self::CoeffReturnType* output) { + *output = InnerMostDimReducer::reduce(self, firstIndex, numValuesToReduce, reducer); + } +}; + +// Multithreaded full reducer +template +struct FullReducer { + static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful; + static constexpr Index PacketSize = unpacket_traits::size; + + // launch one reducer per thread and accumulate the result. + static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, + typename Self::CoeffReturnType* output) { + typedef typename Self::Index Index; + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + if (num_coeffs == 0) { + *output = reducer.finalize(reducer.initialize()); + return; + } + const TensorOpCost cost = self.m_impl.costPerCoeff(Vectorizable) + + TensorOpCost(0, 0, internal::functor_traits::Cost, Vectorizable, PacketSize); + const Index num_threads = TensorCostModel::numThreads(num_coeffs, cost, device.numThreads()); + if (num_threads == 1) { + *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + return; + } + const Index blocksize = num_coeffs / num_threads; + const Index numblocks = blocksize > 0 ? 
num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); + + Barrier barrier(internal::convert_index(numblocks)); + MaxSizeVector shards(numblocks, reducer.initialize()); + for (Index i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier(&barrier, &FullReducerShard::run, self, i * blocksize, + blocksize, reducer, &shards[i]); + } + typename Self::CoeffReturnType finalShard; + if (numblocks * blocksize < num_coeffs) { + finalShard = InnerMostDimReducer::reduce(self, numblocks * blocksize, + num_coeffs - numblocks * blocksize, reducer); + } else { + finalShard = reducer.initialize(); + } + barrier.Wait(); + + for (Index i = 0; i < numblocks; ++i) { + reducer.reduce(shards[i], &finalShard); + } + *output = reducer.finalize(finalShard); + } +}; + +#endif + +// Default inner reducer +template +struct InnerReducer { + static constexpr bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, + typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; + +// Default outer reducer +template +struct OuterReducer { + static constexpr bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, + typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; + +#ifdef EIGEN_USE_SYCL +// Default Generic reducer +template +struct GenericReducer { + static constexpr bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, + typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; +#endif + +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, + unsigned int*); + +#if defined(EIGEN_HAS_GPU_FP16) +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat( + R, const S, I_, internal::packet_traits::type*); +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, + internal::packet_traits::type*); +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); + +#endif + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); +#endif + +/** + * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op. + * This allows the reduction to have a different type for the accumulator than the input data type. + * If this is the case, the functor needs to have two reduce method: one for reducing an element of the input + * with the accumulator and the other for reducing two accumulators. + * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for + * some properties of the input. 
+ */ +template +struct ReductionReturnType { +#if defined(EIGEN_USE_SYCL) + typedef std::remove_const_t().initialize())> type; +#else + typedef std::remove_const_t type; +#endif +}; + +} // end namespace internal + +template class MakePointer_> +class TensorReductionOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims) + : m_expr(expr), m_dims(dims) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) + : m_expr(expr), m_dims(dims), m_reducer(reducer) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dims& dims() const { return m_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& reducer() const { return m_reducer; } + + protected: + typename XprType::Nested m_expr; + const Dims m_dims; + const Op m_reducer; +}; + +template +struct TensorReductionEvaluatorBase; + +// Eval as rvalue +template class MakePointer_, typename Device> +struct TensorReductionEvaluatorBase, Device> { + typedef internal::reducer_traits ReducerTraits; + typedef Dims ReducedDims; + typedef TensorReductionOp XprType; + typedef typename XprType::Index Index; + typedef ArgType ChildType; + typedef typename TensorEvaluator::Dimensions InputDimensions; + static constexpr int NumInputDims = internal::array_size::value; + static constexpr int NumReducedDims = internal::array_size::value; + static constexpr int NumOutputDims = NumInputDims - NumReducedDims; + typedef std::conditional_t, DSizes > Dimensions; + typedef typename XprType::Scalar Scalar; + typedef TensorReductionEvaluatorBase, Device> Self; + static constexpr bool InputPacketAccess = TensorEvaluator::PacketAccess; + typedef typename internal::ReductionReturnType::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr Index PacketSize = PacketType::size; + + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. 
+ static constexpr int NumPreservedStrides = max_n_1::size; + + // For full reductions +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + static constexpr bool RunningOnGPU = internal::is_same::value; + static constexpr bool RunningOnSycl = false; +#elif defined(EIGEN_USE_SYCL) + static constexpr bool RunningOnSycl = internal::is_same, Eigen::SyclDevice>::value; + static constexpr bool RunningOnGPU = false; +#else + static constexpr bool RunningOnGPU = false; + static constexpr bool RunningOnSycl = false; +#endif + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess, + BlockAccess = false, + PreferBlockAccess = true, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + static constexpr bool ReducingInnerMostDims = internal::are_inner_most_dims::value; + static constexpr bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; + static constexpr bool RunningFullReduction = (NumOutputDims == 0); + + EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) { + EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Build the bitmap indicating if an input dimension is reduced or not. + for (int i = 0; i < NumInputDims; ++i) { + m_reduced[i] = false; + } + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op.dims()[i] >= 0); + eigen_assert(op.dims()[i] < NumInputDims); + m_reduced[op.dims()[i]] = true; + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + internal::DimInitializer::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims); + + // Precompute output strides. + if (NumOutputDims > 0) { + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + } + } else { + m_outputStrides[static_cast(NumOutputDims - 1)] = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + } + } + } + + // Precompute input strides. 
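+    // (ColMajor example, for intuition: input dims {2, 3, 4} give input
+    // strides {1, 2, 6}; the RowMajor branch runs the same recursion from the back.)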
+ if (NumInputDims > 0) { + array input_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i - 1] * input_dims[i - 1]; + } + } else { + input_strides.back() = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + } + + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + m_reducedStrides[reduceIndex] = input_strides[i]; + ++reduceIndex; + } else { + m_preservedStrides[outputIndex] = input_strides[i]; + m_output_to_input_dim_map[outputIndex] = i; + ++outputIndex; + } + } + } + + // Special case for full reductions + if (NumOutputDims == 0) { + m_preservedStrides[0] = internal::array_prod(input_dims); + } + + m_numValuesToReduce = NumOutputDims == 0 ? internal::array_prod(input_dims) + : (static_cast(Layout) == static_cast(ColMajor)) + ? m_preservedStrides[0] + : m_preservedStrides[static_cast(NumOutputDims - 1)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) { + // Use the FullReducer if possible. + if ((RunningFullReduction && RunningOnSycl) || + (RunningFullReduction && internal::FullReducer::HasOptimizedImplementation && + ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || !RunningOnGPU))) { + bool need_assign = false; + if (!data) { + m_result = static_cast( + m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType)))); + data = m_result; + need_assign = true; + } + Op reducer(m_reducer); + internal::FullReducer::run(*this, reducer, m_device, data); + return need_assign; + } + + // Attempt to use an optimized reduction. 
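+    // (An "inner" reduction collapses the innermost, fastest-varying dimensions,
+    // an "outer" reduction the outermost ones; the checks below pick the matching
+    // specialized kernel by inspecting the m_reduced bitmap.)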
+ else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) { + bool reducing_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast(Layout) == static_cast(ColMajor)) { + reducing_inner_dims &= m_reduced[i]; + } else { + reducing_inner_dims &= m_reduced[NumInputDims - 1 - i]; + } + } + if (internal::InnerReducer::HasOptimizedImplementation && + (reducing_inner_dims || ReducingInnerMostDims)) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && + num_values_to_reduce > 128) || + (RunningOnSycl)) { + data = static_cast(m_device.get( + (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + m_result = data; + } else { + return true; + } + } + Op reducer(m_reducer); + // For SYCL this if always return false + if (internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, + num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } + } + + bool preserving_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast(Layout) == static_cast(ColMajor)) { + preserving_inner_dims &= m_reduced[NumInputDims - 1 - i]; + } else { + preserving_inner_dims &= m_reduced[i]; + } + } + if (internal::OuterReducer::HasOptimizedImplementation && preserving_inner_dims) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && + num_values_to_reduce > 32) || + (RunningOnSycl)) { + data = static_cast(m_device.get( + (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + m_result = data; + } else { + return true; + } + } + Op reducer(m_reducer); + // For SYCL this if always return false + if (internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, + num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } + } +#if defined(EIGEN_USE_SYCL) + // If there is no Optimised version for SYCL, the reduction expression + // must break into two subexpression and use the SYCL generic Reducer on the device. 
+ if (RunningOnSycl) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + data = static_cast( + m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + m_result = data; + } + Op reducer(m_reducer); + internal::GenericReducer::run(*this, reducer, m_device, data, num_values_to_reduce, + num_coeffs_to_preserve); + return (m_result != NULL); + } +#endif + } + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) { done(evalSubExprsIfNeededCommon(data)); }); + } +#endif + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + return evalSubExprsIfNeededCommon(data); + } + + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + if (m_result) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + if ((RunningFullReduction || RunningOnGPU) && m_result) { + return *(m_result + index); + } + Op reducer(m_reducer); + if (ReducingInnerMostDims || RunningFullReduction) { + const Index num_values_to_reduce = (static_cast(Layout) == static_cast(ColMajor)) + ? m_preservedStrides[0] + : m_preservedStrides[NumPreservedStrides - 1]; + return internal::InnerMostDimReducer::reduce(*this, firstInput(index), num_values_to_reduce, reducer); + } else { + typename Self::CoeffReturnType accum = reducer.initialize(); + internal::GenericDimReducer::reduce(*this, firstInput(index), reducer, &accum); + return reducer.finalize(accum); + } + } + + // TODO(bsteiner): provide a more efficient implementation. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions()))); + + if (RunningOnGPU && m_result) { + return internal::pload(m_result + index); + } + + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + if (ReducingInnerMostDims) { + const Index num_values_to_reduce = (static_cast(Layout) == static_cast(ColMajor)) + ? m_preservedStrides[0] + : m_preservedStrides[NumPreservedStrides - 1]; + const Index firstIndex = firstInput(index); + for (Index i = 0; i < PacketSize; ++i) { + Op reducer(m_reducer); + values[i] = internal::InnerMostDimReducer::reduce(*this, firstIndex + i * num_values_to_reduce, + num_values_to_reduce, reducer); + } + } else if (PreservingInnerMostDims) { + const Index firstIndex = firstInput(index); + const int innermost_dim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : NumOutputDims - 1; + // TBD: extend this the the n innermost dimensions that we preserve. 
+ if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) { + Op reducer(m_reducer); + typename Self::PacketReturnType accum = reducer.template initializePacket(); + internal::InnerMostDimPreserver::reduce(*this, firstIndex, reducer, &accum); + return reducer.finalizePacket(accum); + } else { + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + } + } else { + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + // Must be called after evalSubExprsIfNeeded(). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + if (RunningFullReduction && m_result) { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } else { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const double compute_cost = num_values_to_reduce * internal::functor_traits::Cost; + return m_impl.costPerCoeff(vectorized) * num_values_to_reduce + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC const Device& device() const { return m_device; } + + private: + template + friend struct internal::GenericDimReducer; + template + friend struct internal::InnerMostDimReducer; + template + friend struct internal::InnerMostDimPreserver; + template + friend struct internal::FullReducer; +#ifdef EIGEN_USE_THREADS + template + friend struct internal::FullReducerShard; +#endif +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + template + KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); +#if defined(EIGEN_HAS_GPU_FP16) + template + KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, + internal::packet_traits::type*); + template + KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, + internal::packet_traits::type*); + template + KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); +#endif + template + KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); + + template + KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); +#endif + +#if defined(EIGEN_USE_SYCL) + template + friend class TensorSycl::internal::GenericNondeterministicReducer; + // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer + template + friend struct internal::GenericReducer; +#endif + + template + friend struct internal::InnerReducer; + + struct BlockIteratorState { + Index input_dim; + Index output_size; + Index output_count; + }; + + // Returns the Index in the input tensor of the first value that needs to be + // used to compute the reduction at output index "index". + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + if (ReducingInnerMostDims) { + if (static_cast(Layout) == static_cast(ColMajor)) { + return index * m_preservedStrides[0]; + } else { + return index * m_preservedStrides[NumPreservedStrides - 1]; + } + } + // TBD: optimize the case where we preserve the innermost dimensions. 
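+    // Worked example with illustrative numbers: ColMajor, two output dims,
+    // m_outputStrides = {1, 4}, m_preservedStrides = {2, 8}. Output index 6
+    // splits as idx = 6 / 4 = 1 with remainder 2, so the loop and the final
+    // step give startInput = 1 * 8 + 2 * 2 = 12.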
+ Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (PreservingInnerMostDims) { + eigen_assert(m_preservedStrides[0] == 1); + startInput += index; + } else { + startInput += index * m_preservedStrides[0]; + } + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (PreservingInnerMostDims) { + eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1); + startInput += index; + } else { + startInput += index * m_preservedStrides[NumPreservedStrides - 1]; + } + } + return startInput; + } + + // Bitmap indicating if an input dimension is reduced or not. + array m_reduced; + // Dimensions of the output of the operation. + Dimensions m_dimensions; + // Precomputed strides for the output tensor. + // Avoid zero-sized arrays, since element access fails to compile on GPU. + array m_outputStrides; + array, (std::max)(NumOutputDims, 1)> m_fastOutputStrides; + array m_preservedStrides; + // Map from output to input dimension index. + array m_output_to_input_dim_map; + // How many values go into each reduction + Index m_numValuesToReduce; + + // Subset of strides of the input tensor for the reduced dimensions. + // Indexed by reduced dimensions. + array m_reducedStrides; + // Size of the input dimensions that are reduced. + // Indexed by reduced dimensions. + array m_reducedDims; + + // Evaluator for the input expression. + TensorEvaluator m_impl; + + // Operation to apply for computing the reduction. 
+  Op m_reducer;
+
+  EvaluatorPointerType m_result;
+
+  const Device EIGEN_DEVICE_REF m_device;
+};
+
+template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
+    : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
+  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device) {}
+};
+
+template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
+    : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
+      Base;
+  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device)
+      : Base(op, device) {}
+  // The coeff function in the base class uses a recursive method, which is not standard layout
+  // and cannot be used in a SYCL kernel; therefore the coeff function is overridden for the SYCL kernel.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
+    return *(this->data() + index);
+  }
+  // The packet function in the base class uses the same recursive method and likewise cannot be used
+  // in a SYCL kernel; therefore the packet function is overridden for the SYCL kernel.
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
+    return internal::pload<typename Base::PacketReturnType>(this->data() + index);
+  }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
new file mode 100644
index 00000000000..c5273e9b82d
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -0,0 +1,958 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+// Full reducers for GPU, don't vectorize for now
+
+// Reducer function that enables multiple GPU threads to safely accumulate at the same
+// output address. It basically reads the current value of the output variable, and
+// attempts to update it with the new value. If in the meantime another GPU thread
+// updated the content of the output address, it will try again.
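+// For intuition, a minimal standalone sketch of the same compare-and-swap loop,
+// assuming a 32-bit float sum reduction (the helper name is illustrative, not
+// upstream API):
+//
+//   __device__ void atomic_add_via_cas(float* output, float val) {
+//     unsigned int old_bits = __float_as_uint(*output);
+//     for (;;) {
+//       const unsigned int new_bits = __float_as_uint(__uint_as_float(old_bits) + val);
+//       const unsigned int seen = atomicCAS(reinterpret_cast<unsigned int*>(output),
+//                                           old_bits, new_bits);
+//       if (seen == old_bits) return;  // our update landed
+//       old_bits = seen;               // another thread won the race: retry
+//     }
+//   }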
+template +__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) + if (sizeof(T) == 4) { + unsigned int oldval = *reinterpret_cast(output); + unsigned int newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } else if (sizeof(T) == 8) { + unsigned long long oldval = *reinterpret_cast(output); + unsigned long long newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned long long readback; + while ((readback = atomicCAS(reinterpret_cast(output), oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } else { + gpu_assert(0 && "Wordsize not supported"); + } +#else // EIGEN_CUDA_ARCH >= 300 + EIGEN_UNUSED_VARIABLE(output); + EIGEN_UNUSED_VARIABLE(accum); + EIGEN_UNUSED_VARIABLE(reducer); + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +// We extend atomicExch to support extra data types +template +__device__ inline Type atomicExchCustom(Type* address, Type val) { + return atomicExch(address, val); +} + +template <> +__device__ inline double atomicExchCustom(double* address, double val) { + unsigned long long int* address_as_ull = reinterpret_cast(address); + return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); +} + +#ifdef EIGEN_HAS_GPU_FP16 +template +__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) { + unsigned int oldval = *reinterpret_cast(output); + unsigned int newval = oldval; + reducer.reducePacket(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reducePacket(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } +} +#ifdef EIGEN_GPU_COMPILE_PHASE +// reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations +template +__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) { + half2* houtput = reinterpret_cast(output); + half2* haccum = reinterpret_cast(&accum); + for (int i = 0; i < 4; ++i) { + atomicReduce(houtput + i, *(haccum + i), reducer); + } +} +#endif // EIGEN_GPU_COMPILE_PHASE +#endif // EIGEN_HAS_GPU_FP16 + +template <> +__device__ inline void atomicReduce(float* output, float accum, SumReducer&) { +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) + atomicAdd(output, accum); +#else // EIGEN_CUDA_ARCH >= 300 + EIGEN_UNUSED_VARIABLE(output); + EIGEN_UNUSED_VARIABLE(accum); + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, + CoeffType* output) { + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const Index num_threads = 
blockDim.x * gridDim.x;
+  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+    output[i] = val;
+  }
+}
+
+template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+                                                                 typename Self::CoeffReturnType* output,
+                                                                 unsigned int* semaphore) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  // Initialize the output value
+  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      *output = reducer.initialize();
+    }
+  } else {
+    if (threadIdx.x == 0) {
+      unsigned int block = atomicCAS(semaphore, 0u, 1u);
+      if (block == 0) {
+        // We're the first block to run, initialize the output value
+        atomicExchCustom(output, reducer.initialize());
+        __threadfence();
+        atomicExch(semaphore, 2u);
+      } else {
+        // Wait for the first block to initialize the output value.
+        // Use atomicCAS here to ensure that the reads aren't cached
+        unsigned int val;
+        do {
+          val = atomicCAS(semaphore, 2u, 2u);
+        } while (val < 2u);
+      }
+    }
+  }
+
+  __syncthreads();
+
+  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
+
+  typename Self::CoeffReturnType accum = reducer.initialize();
+  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread * BlockSize);
+  for (Index i = 0; i < max_iter; i += BlockSize) {
+    const Index index = first_index + i;
+    eigen_assert(index < num_coeffs);
+    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
+    reducer.reduce(val, &accum);
+  }
+
+#pragma unroll
+  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+#if defined(EIGEN_HIPCC)
+    // Use std::is_floating_point to determine the type of the accumulator.
+    // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
+    // and list the float and int versions of __shfl_down as the candidate functions.
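Stepping back from the HIP-specific workaround for a moment: the #pragma unroll loop opened above is a warp-level butterfly reduction. Here is a host-side sketch of what it computes, assuming a 32-lane warp; the lanes array stands in for the per-lane registers that __shfl_down/__shfl_down_sync exchange.

#include <cstdio>

// Host-side sketch of the warp shuffle butterfly: each round, lane i adds the
// value held by lane i + offset, halving `offset` every time, so lane 0 ends
// up with the total for the whole warp after log2(32) = 5 rounds.
int main() {
  const int warp_size = 32;
  float lanes[warp_size];
  for (int i = 0; i < warp_size; ++i) lanes[i] = 1.0f;  // each lane contributes 1
  for (int offset = warp_size / 2; offset > 0; offset /= 2) {
    for (int i = 0; i < warp_size; ++i) {
      // __shfl_down(_sync) returns lanes[i + offset]; lanes past the end are
      // irrelevant because only lane 0's result is consumed afterwards.
      if (i + offset < warp_size) lanes[i] += lanes[i + offset];
    }
  }
  std::printf("warp total = %f\n", lanes[0]);  // 32.000000
  return 0;
}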
+ if (std::is_floating_point::value) { + reducer.reduce(__shfl_down(static_cast(accum), offset, warpSize), &accum); + } else { + reducer.reduce(__shfl_down(static_cast(accum), offset, warpSize), &accum); + } +#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + reducer.reduce(__shfl_down(accum, offset, warpSize), &accum); +#else + reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum); +#endif + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(output, accum, reducer); + } + + if (gridDim.x > 1 && threadIdx.x == 0) { + // Let the last block reset the semaphore + atomicInc(semaphore, gridDim.x + 1); +#if defined(EIGEN_HIPCC) + __threadfence_system(); +#endif + } +#else // EIGEN_CUDA_ARCH >= 300 + EIGEN_UNUSED_VARIABLE(reducer); + EIGEN_UNUSED_VARIABLE(input); + EIGEN_UNUSED_VARIABLE(num_coeffs); + EIGEN_UNUSED_VARIABLE(output); + EIGEN_UNUSED_VARIABLE(semaphore); + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +#ifdef EIGEN_HAS_GPU_FP16 +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, + Index num_coeffs, half* scratch) { + eigen_assert(blockDim.x == 1); + eigen_assert(gridDim.x == 1); + typedef packet_traits::type packet_type; + Index packet_remainder = num_coeffs % Index(unpacket_traits::size); + if (packet_remainder != 0) { + half2* h2scratch = reinterpret_cast(scratch); + for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) { + *h2scratch = __halves2half2(input.coeff(i), input.coeff(i + 1)); + h2scratch++; + } + if ((num_coeffs & 1) != 0) { + half lastCoeff = input.coeff(num_coeffs - 1); + *h2scratch = __halves2half2(lastCoeff, reducer.initialize()); + } + } else { + packet_type reduce = reducer.template initializePacket(); + internal::pstoreu(scratch, reduce); + } +} + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self /*input*/, + Index num_coeffs, half* output) { + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const Index num_threads = blockDim.x * gridDim.x; + typedef typename packet_traits::type PacketType; + + const Index num_packets = num_coeffs / Index(unpacket_traits::size); + PacketType* p_output = reinterpret_cast(output); + for (Index i = thread_id; i < num_packets; i += num_threads) { + p_output[i] = reducer.template initializePacket(); + } + Index packet_remainder = num_coeffs % Index(unpacket_traits::size); + if (thread_id < packet_remainder) { + output[num_coeffs - packet_remainder + thread_id] = reducer.initialize(); + } +} + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, + Index num_coeffs, half* output, + half* scratch) { + typedef typename packet_traits::type PacketType; + const int packet_width = unpacket_traits::size; + eigen_assert(NumPerThread % packet_width == 0); + const Index first_index = blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x; + + // Initialize the output value if it wasn't initialized by the ReductionInitKernel + + if (gridDim.x == 1) { + if (first_index == 0) { + int rem = num_coeffs % packet_width; + if (rem != 0) { + half2* p_scratch = reinterpret_cast(scratch); + pstoreu(scratch, reducer.template initializePacket()); + for (int i = 0; i < rem / 2; i++) { + *p_scratch = __halves2half2(input.coeff(num_coeffs - packet_width + 2 * i), + input.coeff(num_coeffs - packet_width + 
2 * i + 1)); + p_scratch++; + } + if ((num_coeffs & 1) != 0) { + half last = input.coeff(num_coeffs - 1); + *p_scratch = __halves2half2(last, reducer.initialize()); + } + } else { + PacketType reduce = reducer.template initializePacket(); + pstoreu(scratch, reduce); + } + } + __syncthreads(); + } + + PacketType accum = reducer.template initializePacket(); + const Index max_iter = + numext::mini((num_coeffs - first_index) / packet_width, NumPerThread * BlockSize / packet_width); + for (Index i = 0; i < max_iter; i += BlockSize) { + const Index index = first_index + packet_width * i; + eigen_assert(index + packet_width < num_coeffs); + PacketType val = input.template packet(index); + reducer.reducePacket(val, &accum); + } + +#pragma unroll + for (int offset = warpSize / 2; offset > 0; offset /= 2) { +#if defined(EIGEN_HIPCC) + PacketType r1; + half2* hr = reinterpret_cast(&r1); + half2* hacc = reinterpret_cast(&accum); + for (int i = 0; i < packet_width / 2; i++) { + // FIXME : remove this workaround once we have native half/half2 support for __shfl_down + union { + int i; + half2 h; + } wka_in, wka_out; + wka_in.h = hacc[i]; + wka_out.i = __shfl_down(wka_in.i, offset, warpSize); + hr[i] = wka_out.h; + } + reducer.reducePacket(r1, &accum); +#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + PacketType r1; + half2* hr = reinterpret_cast(&r1); + half2* hacc = reinterpret_cast(&accum); + for (int i = 0; i < packet_width / 2; i++) { + hr[i] = __shfl_down(hacc[i], offset, warpSize); + } + reducer.reducePacket(r1, &accum); +#else + PacketType r1; + half2* hr = reinterpret_cast(&r1); + half2* hacc = reinterpret_cast(&accum); + for (int i = 0; i < packet_width / 2; i++) { + hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize); + } + reducer.reducePacket(r1, &accum); + +#endif + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(reinterpret_cast(scratch), accum, reducer); + } + + __syncthreads(); + half2* rv1 = reinterpret_cast(scratch); + if (packet_width > 2) { + reducer.reducePacket(rv1[2], rv1); + reducer.reducePacket(rv1[3], rv1 + 1); + reducer.reducePacket(rv1[1], rv1); + } + if (gridDim.x == 1) { + if (first_index == 0) { + half tmp = __low2half(*rv1); + reducer.reduce(__high2half(*rv1), &tmp); + *output = tmp; + } + } +} + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) { + eigen_assert(threadIdx.x == 1); + typedef packet_traits::type packet_type; + if (unpacket_traits::size == 1) { + *output = *scratch; + } else { + half2* pscratch = reinterpret_cast(scratch); + half tmp = __float2half(0.f); + for (int i = 0; i < unpacket_traits::size; i += 2) { + reducer.reduce(__low2half(*pscratch), &tmp); + reducer.reduce(__high2half(*pscratch), &tmp); + pscratch++; + } + *output = tmp; + } +} + +#endif // EIGEN_HAS_GPU_FP16 + +template +struct FullReductionLauncher { + static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) { + gpu_assert(false && "Should only be called on doubles, floats and half floats"); + } +}; + +// Specialization for float and double +template +struct FullReductionLauncher< + Self, Op, OutputType, PacketAccess, + std::enable_if_t::value || internal::is_same::value, + void>> { + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, + typename Self::Index num_coeffs) { + typedef typename Self::Index Index; + const int block_size = 256; + const int num_per_thread = 128; + const int 
num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
+
+    unsigned int* semaphore = NULL;
+    if (num_blocks > 1) {
+      semaphore = device.semaphore();
+    }
+
+    LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), num_blocks, block_size, 0,
+                      device, reducer, self, num_coeffs, output, semaphore);
+  }
+};
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {
+  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+    gpu_assert(false && "Should not be called since there is no packet accessor");
+  }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
+                  typename Self::Index num_coeffs) {
+    typedef typename Self::Index Index;
+
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
+    half* scratch = static_cast<half*>(device.scratchpad());
+
+    if (num_blocks > 1) {
+      // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
+      // won't be race conditions between multiple thread blocks.
+      LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>), 1, 1, 0, device, reducer, self,
+                        num_coeffs, scratch);
+    }
+
+    LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>), num_blocks,
+                      block_size, 0, device, reducer, self, num_coeffs, output, scratch);
+
+    if (num_blocks > 1) {
+      LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>), 1, 1, 0, device, reducer, output, scratch);
+    }
+  }
+};
+#endif  // EIGEN_HAS_GPU_FP16
+
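The launch configuration above follows directly from the work decomposition: each block has block_size threads and each thread reduces num_per_thread coefficients, so the grid needs ceil(num_coeffs / (block_size * num_per_thread)) blocks. A small self-contained check of that arithmetic (the input size here is illustrative, not taken from the patch):

#include <cstdio>

// Integer ceiling division, as numext::div_ceil does in the launchers above.
constexpr long div_ceil(long a, long b) { return (a + b - 1) / b; }

int main() {
  const long num_coeffs = 1L << 20;  // example input size: 1048576 coefficients
  const int block_size = 256;        // threads per block
  const int num_per_thread = 128;    // coefficients each thread reduces
  const long num_blocks = div_ceil(num_coeffs, long(block_size) * num_per_thread);
  // 1048576 / (256 * 128) = 32 blocks exactly; any remainder would add one more
  // block, which is why ceiling division is used instead of plain division.
  std::printf("num_blocks = %ld\n", num_blocks);
  return 0;
}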
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+  // Unfortunately nvidia doesn't handle exotic types such as complex well,
+  // so the optimized version of the code is restricted to the simple cases
+  // of doubles, floats and half floats.
+#ifdef EIGEN_HAS_GPU_FP16
+  static constexpr bool HasOptimizedImplementation =
+      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                           internal::is_same<typename Self::CoeffReturnType, double>::value ||
+                                           (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
+                                            reducer_traits<Op, GpuDevice>::PacketAccess));
+#else   // EIGEN_HAS_GPU_FP16
+  static constexpr bool HasOptimizedImplementation =
+      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif  // EIGEN_HAS_GPU_FP16
+
+  template <typename OutputType>
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return;
+    }
+
+    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer,
+                                                                                                  device, output,
+                                                                                                  num_coeffs);
+  }
+};
+
+template <int NumPerThread, typename Self, typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input,
+                                                                  Index num_coeffs_to_reduce,
+                                                                  Index num_preserved_coeffs,
+                                                                  typename Self::CoeffReturnType* output) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  typedef typename Self::CoeffReturnType Type;
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  const int unroll_times = 16;
+  eigen_assert(NumPerThread % unroll_times == 0);
+
+  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = i / input_col_blocks;
+
+    if (row < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
+
+      Type reduced_val = reducer.initialize();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
+        if (last_col >= num_coeffs_to_reduce) {
+          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
+            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+            reducer.reduce(val, &reduced_val);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k);
+            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+#if defined(EIGEN_HIPCC)
+        // Use std::is_floating_point to determine the type of reduced_val.
+        // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
+        // and list the float and int versions of __shfl_down as the candidate functions.
+ if (std::is_floating_point::value) { + reducer.reduce(__shfl_down(static_cast(reduced_val), offset), &reduced_val); + } else { + reducer.reduce(__shfl_down(static_cast(reduced_val), offset), &reduced_val); + } +#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); +#else + reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val); +#endif + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(&(output[row]), reduced_val, reducer); + } + } + } +#else // EIGEN_CUDA_ARCH >= 300 + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +#ifdef EIGEN_HAS_GPU_FP16 + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, + Index num_coeffs_to_reduce, + Index num_preserved_coeffs, half* output) { + eigen_assert(blockDim.y == 1); + eigen_assert(blockDim.z == 1); + eigen_assert(gridDim.y == 1); + eigen_assert(gridDim.z == 1); + + typedef typename packet_traits::type PacketType; + const int packet_width = unpacket_traits::size; + const int unroll_times = 16 / packet_width; + eigen_assert(NumPerThread % unroll_times == 0); + eigen_assert(unroll_times % 2 == 0); + + const Index input_col_blocks = numext::div_ceil(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2); + const Index num_input_blocks = numext::div_ceil(input_col_blocks * num_preserved_coeffs, 2); + + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + Index i = packet_width * thread_id; + for (; i + packet_width <= num_preserved_coeffs; i += packet_width * num_threads) { + PacketType* poutput = reinterpret_cast(output + i); + *poutput = reducer.template initializePacket(); + } + if (i < num_preserved_coeffs) { + output[i] = reducer.initialize(); + } + __syncthreads(); + } + + for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { + const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows + + if (row + 1 < num_preserved_coeffs) { + const Index col_block = i % input_col_blocks; + const Index col_begin = packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x); + + PacketType reduced_val1 = reducer.template initializePacket(); + PacketType reduced_val2 = reducer.template initializePacket(); + + for (Index j = 0; j < NumPerThread; j += unroll_times) { + const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * packet_width; + if (last_col >= num_coeffs_to_reduce) { + Index col = col_begin + blockDim.x * j; + for (; col + packet_width <= num_coeffs_to_reduce; col += blockDim.x) { + const PacketType val1 = input.m_impl.template packet(row * num_coeffs_to_reduce + col); + reducer.reducePacket(val1, &reduced_val1); + const PacketType val2 = input.m_impl.template packet((row + 1) * num_coeffs_to_reduce + col); + reducer.reducePacket(val2, &reduced_val2); + } + if (col < num_coeffs_to_reduce) { + PacketType r1 = reducer.template initializePacket(); + PacketType r2 = reducer.template initializePacket(); + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + while (col + 1 < num_coeffs_to_reduce) { + *hr1 = __halves2half2(input.m_impl.coeff(row * num_coeffs_to_reduce + col), + input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1)); + *hr2 = __halves2half2(input.m_impl.coeff((row 
+ 1) * num_coeffs_to_reduce + col), + input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + 1)); + hr1++; + hr2++; + col += 2; + } + if (col < num_coeffs_to_reduce) { + // Peel; + const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + *hr1 = __halves2half2(last1, reducer.initialize()); + const half last2 = input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col); + *hr2 = __halves2half2(last2, reducer.initialize()); + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); + } + break; + } else { + // Faster version of the loop with no branches after unrolling. +#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + blockDim.x * (j + k) * packet_width; + reducer.reducePacket(input.m_impl.template packet(row * num_coeffs_to_reduce + col), + &reduced_val1); + reducer.reducePacket(input.m_impl.template packet((row + 1) * num_coeffs_to_reduce + col), + &reduced_val2); + } + } + } + +#pragma unroll + for (int offset = warpSize / 2; offset > 0; offset /= 2) { +#if defined(EIGEN_HIPCC) + PacketType r1; + PacketType r2; + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + half2* rv1 = reinterpret_cast(&reduced_val1); + half2* rv2 = reinterpret_cast(&reduced_val2); + for (int i = 0; i < packet_width / 2; i++) { + // FIXME : remove this workaround once we have native half/half2 support for __shfl_down + union { + int i; + half2 h; + } wka_in1, wka_out1; + wka_in1.h = rv1[i]; + wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize); + hr1[i] = wka_out1.h; + + union { + int i; + half2 h; + } wka_in2, wka_out2; + wka_in2.h = rv2[i]; + wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize); + hr2[i] = wka_out2.h; + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); +#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + PacketType r1; + PacketType r2; + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + half2* rv1 = reinterpret_cast(&reduced_val1); + half2* rv2 = reinterpret_cast(&reduced_val2); + for (int i = 0; i < packet_width / 2; i++) { + hr1[i] = __shfl_down(rv1[i], offset, warpSize); + hr2[i] = __shfl_down(rv2[i], offset, warpSize); + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); +#else + PacketType r1; + PacketType r2; + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + half2* rr1 = reinterpret_cast(&reduced_val1); + half2* rr2 = reinterpret_cast(&reduced_val2); + for (int j = 0; j < packet_width / 2; j++) { + hr1[j] = __shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize); + hr2[j] = __shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize); + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); + +#endif + } + half2* rv1 = reinterpret_cast(&reduced_val1); + half2* rv2 = reinterpret_cast(&reduced_val2); + half2 val; + if (packet_width > 2) { + reducer.reducePacket(rv1[2], rv1); + reducer.reducePacket(rv1[3], rv1 + 1); + reducer.reducePacket(rv1[1], rv1); + reducer.reducePacket(rv2[2], rv2); + reducer.reducePacket(rv2[3], rv2 + 1); + reducer.reducePacket(rv2[1], rv2); + } + half val1 = __low2half(*rv1); + reducer.reduce(__high2half(*rv1), &val1); + half val2 = __low2half(*rv2); + reducer.reduce(__high2half(*rv2), &val2); + val = __halves2half2(val1, val2); + if ((threadIdx.x & (warpSize - 1)) == 0) { + half* loc = output + row; + atomicReduce(reinterpret_cast(loc), val, reducer); + } + } + 
} +} + +#endif // EIGEN_HAS_GPU_FP16 + +template +struct InnerReductionLauncher { + static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, + typename Self::Index) { + gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device"); + return true; + } +}; + +// Specialization for float and double +template +struct InnerReductionLauncher< + Self, Op, OutputType, PacketAccess, + std::enable_if_t::value || internal::is_same::value, + void>> { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, + typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = 256; + const int num_per_thread = 128; + const int dyn_blocks = numext::div_ceil(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be a race conditions between multiple thread blocks. + const int dyn_blocks2 = numext::div_ceil(num_preserved_vals, 1024); + const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024; + const int num_blocks2 = numext::mini(max_blocks2, dyn_blocks2); + LAUNCH_GPU_KERNEL((ReductionInitKernel), num_blocks2, 1024, 0, device, reducer.initialize(), + num_preserved_vals, output); + } + + LAUNCH_GPU_KERNEL((InnerReductionKernel), num_blocks, block_size, 0, device, + reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; + +#ifdef EIGEN_HAS_GPU_FP16 +template +struct InnerReductionLauncher { + static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) { + gpu_assert(false && "Should not be called since there is no packet accessor"); + return true; + } +}; + +template +struct InnerReductionLauncher { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, + typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + if (num_preserved_vals % 2 != 0) { + // Not supported yet, revert to the slower code path + return true; + } + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = /*256*/ 128; + const int num_per_thread = /*128*/ 64; + const int dyn_blocks = numext::div_ceil(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be a race conditions between multiple thread blocks. 
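The inner-reduction launcher above clamps the dynamically required grid by what the device can keep resident, and a separate init kernel is launched only in the multi-block case, where output races between blocks are possible. A sketch of the clamping arithmetic, with made-up device numbers (not values queried from any real GPU):

#include <algorithm>
#include <cstdio>

int main() {
  const long num_coeffs = 4096L * 4096L;  // total elements = reduce * preserved
  const int block_size = 256, num_per_thread = 128;
  const int multiprocessors = 80;         // illustrative: an 80-SM device
  const int max_threads_per_sm = 2048;    // illustrative resident-thread limit
  // Blocks the workload would like, vs. blocks the device can keep resident.
  const long dyn_blocks = (num_coeffs + long(block_size) * num_per_thread - 1) /
                          (long(block_size) * num_per_thread);
  const long max_blocks = long(multiprocessors) * max_threads_per_sm / block_size;
  const long num_blocks = std::min(max_blocks, dyn_blocks);
  std::printf("dyn=%ld max=%ld -> launch %ld blocks\n", dyn_blocks, max_blocks, num_blocks);
  return 0;  // prints dyn=512 max=640 -> launch 512 blocks
}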
+ LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat), 1, 1, 0, device, reducer, self, + num_preserved_vals, output); + } + + LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat), num_blocks, block_size, 0, + device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; +#endif // EIGEN_HAS_GPU_FP16 + +template +struct InnerReducer { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats and half floats. +#ifdef EIGEN_HAS_GPU_FP16 + static constexpr bool HasOptimizedImplementation = + !Self::ReducerTraits::IsStateful && (internal::is_same::value || + internal::is_same::value || + (internal::is_same::value && + reducer_traits::PacketAccess)); +#else // EIGEN_HAS_GPU_FP16 + static constexpr bool HasOptimizedImplementation = + !Self::ReducerTraits::IsStateful && (internal::is_same::value || + internal::is_same::value); +#endif // EIGEN_HAS_GPU_FP16 + + template + static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, + typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. + if (num_coeffs == 0) { + return true; + } + // It's faster to use the usual code. + if (num_coeffs_to_reduce <= 128) { + return true; + } + + return InnerReductionLauncher::PacketAccess>::run( + self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); + } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, + Index num_coeffs_to_reduce, + Index num_preserved_coeffs, + typename Self::CoeffReturnType* output) { + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } + __syncthreads(); + } + + // Do the reduction. + const Index max_iter = num_preserved_coeffs * numext::div_ceil(num_coeffs_to_reduce, NumPerThread); + for (Index i = thread_id; i < max_iter; i += num_threads) { + const Index input_col = i % num_preserved_coeffs; + const Index input_row = (i / num_preserved_coeffs) * NumPerThread; + typename Self::CoeffReturnType reduced_val = reducer.initialize(); + const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce); + for (Index j = input_row; j < max_row; j++) { + typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col); + reducer.reduce(val, &reduced_val); + } + atomicReduce(&(output[input_col]), reduced_val, reducer); + } +} + +template +struct OuterReducer { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats. 
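For orientation, the outer-reduction kernel above reduces the slowest-varying dimension, so each preserved output element gathers inputs with stride num_preserved_coeffs from the flattened tensor. A minimal host-side sketch of that access pattern for a sum reducer:

#include <cstdio>

int main() {
  const int num_reduce = 3, num_preserved = 4;
  float input[num_reduce * num_preserved];
  for (int i = 0; i < num_reduce * num_preserved; ++i) input[i] = float(i);
  float output[num_preserved];
  for (int col = 0; col < num_preserved; ++col) {
    float acc = 0.0f;  // reducer.initialize() for a sum reducer
    for (int row = 0; row < num_reduce; ++row) {
      // Strided gather, matching `input.m_impl.coeff(j * num_preserved_coeffs + input_col)`
      acc += input[row * num_preserved + col];
    }
    output[col] = acc;
  }
  for (int col = 0; col < num_preserved; ++col) std::printf("%g ", output[col]);
  std::printf("\n");  // prints: 12 15 18 21
  return 0;
}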
+ static constexpr bool HasOptimizedImplementation = + !Self::ReducerTraits::IsStateful && (internal::is_same::value || + internal::is_same::value); + template + static +#if !defined(EIGEN_HIPCC) + // FIXME : leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error + // (in the cxx11_tensor_reduction_gpu test) + // + // terminate called after throwing an instance of 'std::runtime_error' + // what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL... + // + // don't know why this happens (and why is it a runtime error instead of a compile time error) + // + // this will be fixed by HIP PR#457 + EIGEN_DEVICE_FUNC +#endif + bool + run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device"); + return true; + } + + static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, + typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + // It's faster to use the usual code. + if (num_coeffs_to_reduce <= 32) { + return true; + } + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = 256; + const int num_per_thread = 16; + const int dyn_blocks = numext::div_ceil(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs in the reduction kernel itself when we don't have to worry + // about race conditions between multiple thread blocks. + const int dyn_blocks2 = numext::div_ceil(num_preserved_vals, 1024); + const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024; + const int num_blocks2 = numext::mini(max_blocks2, dyn_blocks2); + LAUNCH_GPU_KERNEL((ReductionInitKernel), num_blocks2, 1024, 0, device, reducer.initialize(), + num_preserved_vals, output); + } + + LAUNCH_GPU_KERNEL((OuterReductionKernel), num_blocks, block_size, 0, device, + reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; + +#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h new file mode 100644 index 00000000000..6944c033708 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -0,0 +1,588 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorReductionSycl.h + * + * \brief: + * This is the specialization of the reduction operation. 
A two-phase reduction approach
 * is used because the GPU has no global synchronization of global memory across
 * different work-groups/thread blocks. Two kernels are therefore needed to reduce the data:
 * the first kernel reduces the data locally, and each work-group/thread block saves its
 * partial result to global memory. In the second phase (global reduction) a single
 * work-group/thread block reduces the intermediate data to one element.
 * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU:
 * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
 *
 *****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
+struct OpDefiner {
+  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
+  typedef Op type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+                                                                            const Index &) {
+    return accumulator;
+  }
+};
+
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
+  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+    return type();
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
+                                                                           const Index &scale) {
+    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
+    return quotient_op(accumulator, CoeffReturnType(scale));
+  }
+};
+
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
+  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
+  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+    return type();
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+                                                                            const Index &scale) {
+    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
+  }
+};
+
+template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
+          Index local_range>
+struct SecondStepFullReducer {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
+  typedef typename OpDef::type Op;
+  LocalAccessor scratch;
+  InputAccessor aI;
+  OutputAccessor outAcc;
+  Op op;
+  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
+      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
+
+  void operator()(cl::sycl::nd_item<1> itemID) const {
+    // Our empirical research shows that the best performance is achieved when there is only
+    // one element per thread to reduce in the second step; the second-step reduction time is
+    // then almost negligible. Hence, in the second step of the reduction the input size is
+    // fixed to the local size, so only one element is read per thread. The algorithm must be
+    // changed if the number of elements reduced per thread in the second step is greater
+    // than 1; otherwise, the result will be wrong.
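The loop that follows is a local-memory tree reduction: with a power-of-two group size, each round folds the upper half of the scratch buffer onto the lower half, so scratch[0] holds the group result after log2(N) rounds. A host-side sketch of the idea, with barriers elided since a single thread plays all work-items here:

#include <cstdio>

int main() {
  const int local_size = 8;  // must be a power of two, as these kernels assert
  float scratch[local_size] = {3, 1, 4, 1, 5, 9, 2, 6};
  for (int offset = local_size / 2; offset > 0; offset /= 2) {
    // (a work-group barrier would separate the rounds on a real device)
    for (int lid = 0; lid < offset; ++lid) {
      scratch[lid] += scratch[lid + offset];  // fold upper half onto lower half
    }
  }
  std::printf("group sum = %g\n", scratch[0]);  // prints: group sum = 31
  return 0;
}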
+ const Index localid = itemID.get_local_id(0); + auto aInPtr = aI + localid; + auto aOutPtr = outAcc; + CoeffReturnType *scratchptr = scratch.get_pointer(); + CoeffReturnType accumulator = *aInPtr; + + scratchptr[localid] = op.finalize(accumulator); + for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.reduce(scratchptr[localid + offset], &accumulator); + scratchptr[localid] = op.finalize(accumulator); + } + } + if (localid == 0) *aOutPtr = op.finalize(accumulator); + } +}; + +// Full reduction first phase. In this version the vectorization is true and the reduction accept +// any generic reducerOp e.g( max, min, sum, mean, iamax, iamin, etc ). +template +class FullReductionKernelFunctor { + public: + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::Index Index; + typedef OpDefiner + OpDef; + + typedef typename OpDef::type Op; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::PacketReturnType PacketReturnType; + typedef std::conditional_t<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess), PacketReturnType, + CoeffReturnType> + OutType; + typedef cl::sycl::accessor + LocalAccessor; + LocalAccessor scratch; + Evaluator evaluator; + EvaluatorPointerType final_output; + Index rng; + Op op; + + FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_, + Index rng_, OpType op_) + : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {} + + void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t compute_reduction( + const cl::sycl::nd_item<1> &itemID) const { + auto output_ptr = final_output; + Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize; + Index globalid = itemID.get_global_id(0); + Index localid = itemID.get_local_id(0); + Index step = Evaluator::PacketSize * itemID.get_global_range(0); + Index start = Evaluator::PacketSize * globalid; + // vectorizable parts + PacketReturnType packetAccumulator = op.template initializePacket(); + for (Index i = start; i < VectorizedRange; i += step) { + op.template reducePacket(evaluator.impl().template packet(i), &packetAccumulator); + } + globalid += VectorizedRange; + // non vectorizable parts + for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) { + op.template reducePacket( + ::Eigen::TensorSycl::internal::PacketWrapper::convert_to_packet_type( + evaluator.impl().coeff(i), op.initialize()), + &packetAccumulator); + } + scratch[localid] = packetAccumulator = + OpDef::finalise_op(op.template finalizePacket(packetAccumulator), rng); + // reduction parts // Local size is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = local_range / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.template reducePacket(scratch[localid + offset], &packetAccumulator); + scratch[localid] = op.template finalizePacket(packetAccumulator); + } + } + if (localid == 0) { + output_ptr[itemID.get_group(0)] = + op.finalizeBoth(op.initialize(), op.template finalizePacket(packetAccumulator)); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t compute_reduction( + const cl::sycl::nd_item<1> &itemID) 
const { + auto output_ptr = final_output; + Index globalid = itemID.get_global_id(0); + Index localid = itemID.get_local_id(0); + // vectorizable parts + CoeffReturnType accumulator = op.initialize(); + // non vectorizable parts + for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) { + op.reduce(evaluator.impl().coeff(i), &accumulator); + } + scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng); + + // reduction parts. the local size is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = local_range / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.reduce(scratch[localid + offset], &accumulator); + scratch[localid] = op.finalize(accumulator); + } + } + if (localid == 0) { + output_ptr[itemID.get_group(0)] = op.finalize(accumulator); + } + } +}; + +template +class GenericNondeterministicReducer { + public: + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::Index Index; + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + template + GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_, + Index range_, Index num_values_to_reduce_) + : evaluator(evaluator_), + output_accessor(output_accessor_), + functor(OpDef::get_op(functor_)), + range(range_), + num_values_to_reduce(num_values_to_reduce_) {} + + void operator()(cl::sycl::nd_item<1> itemID) const { + // This is to bypass the statefull condition in Eigen meanReducer + Op non_const_functor; + std::memcpy(&non_const_functor, &functor, sizeof(Op)); + auto output_accessor_ptr = output_accessor; + Index globalid = static_cast(itemID.get_global_linear_id()); + if (globalid < range) { + CoeffReturnType accum = functor.initialize(); + Eigen::internal::GenericDimReducer::reduce( + evaluator, evaluator.firstInput(globalid), non_const_functor, &accum); + output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce); + } + } + + private: + Evaluator evaluator; + EvaluatorPointerType output_accessor; + Op functor; + Index range; + Index num_values_to_reduce; +}; + +enum class reduction_dim { inner_most, outer_most }; +// default is preserver +template +struct PartialReductionKernel { + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::Index Index; + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + typedef cl::sycl::accessor + ScratchAcc; + ScratchAcc scratch; + Evaluator evaluator; + EvaluatorPointerType output_accessor; + Op op; + const Index preserve_elements_num_groups; + const Index reduce_elements_num_groups; + const Index num_coeffs_to_preserve; + const Index num_coeffs_to_reduce; + + PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_, + const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_, + const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_) + : scratch(scratch_), + evaluator(evaluator_), + output_accessor(output_accessor_), + op(OpDef::get_op(op_)), + preserve_elements_num_groups(preserve_elements_num_groups_), + reduce_elements_num_groups(reduce_elements_num_groups_), + num_coeffs_to_preserve(num_coeffs_to_preserve_), + 
num_coeffs_to_reduce(num_coeffs_to_reduce_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId, + CoeffReturnType &accumulator) const { + if (globalPId >= num_coeffs_to_preserve) { + return; + } + Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve) + : globalRId + (globalPId * num_coeffs_to_reduce); + Index localOffset = globalRId; + + const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups; + const Index per_thread_global_stride = + rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride; + for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) { + op.reduce(evaluator.impl().coeff(global_offset), &accumulator); + localOffset += per_thread_local_stride; + global_offset += per_thread_global_stride; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { + const Index linearLocalThreadId = itemID.get_local_id(0); + Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP + : linearLocalThreadId / PannelParameters::LocalThreadSizeR; + Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP + : linearLocalThreadId % PannelParameters::LocalThreadSizeR; + const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups + : itemID.get_group(0) / reduce_elements_num_groups; + const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups + : itemID.get_group(0) % reduce_elements_num_groups; + + Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId; + const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId; + CoeffReturnType *scratchPtr = scratch.get_pointer(); + auto outPtr = output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0); + CoeffReturnType accumulator = op.initialize(); + + element_wise_reduce(globalRId, globalPId, accumulator); + + accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce); + scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] = + accumulator; + if (rt == reduction_dim::inner_most) { + pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP; + rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP; + globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId; + } + + /* Apply the reduction operation between the current local + * id and the one on the other half of the vector. 
*/ + auto out_scratch_ptr = + scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC))); + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (rt == reduction_dim::inner_most) { + accumulator = *out_scratch_ptr; + } + // The Local LocalThreadSizeR is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) { + if (rLocalThreadId < offset) { + op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator); + // The result has already been divided for mean reducer in the + // previous reduction so no need to divide furthermore + *out_scratch_ptr = op.finalize(accumulator); + } + /* All threads collectively read from global memory into local. + * The barrier ensures all threads' IO is resolved before + * execution continues (strictly speaking, all threads within + * a single work-group - there is no co-ordination between + * work-groups, only work-items). */ + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + + if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) { + outPtr[globalPId] = op.finalize(accumulator); + } + } +}; + +template +struct SecondStepPartialReduction { + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + typedef cl::sycl::accessor + ScratchAccessor; + InputAccessor input_accessor; + OutputAccessor output_accessor; + Op op; + const Index num_coeffs_to_preserve; + const Index num_coeffs_to_reduce; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_, + OutputAccessor output_accessor_, OpType op_, + const Index num_coeffs_to_preserve_, + const Index num_coeffs_to_reduce_) + : input_accessor(input_accessor_), + output_accessor(output_accessor_), + op(OpDef::get_op(op_)), + num_coeffs_to_preserve(num_coeffs_to_preserve_), + num_coeffs_to_reduce(num_coeffs_to_reduce_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { + const Index globalId = itemID.get_global_id(0); + + if (globalId >= num_coeffs_to_preserve) return; + + auto in_ptr = input_accessor + globalId; + + OutScalar accumulator = op.initialize(); + // num_coeffs_to_reduce is not bigger that 256 + for (Index i = 0; i < num_coeffs_to_reduce; i++) { + op.reduce(*in_ptr, &accumulator); + in_ptr += num_coeffs_to_preserve; + } + output_accessor[globalId] = op.finalize(accumulator); + } +}; // namespace internal + +template +struct ReductionPannel { + static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP; + static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR; + static EIGEN_CONSTEXPR bool BC = BC_; +}; + +template +struct PartialReducerLauncher { + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::Storage Storage; + typedef typename Self::Index Index; + typedef ReductionPannel + PannelParameters; + + typedef PartialReductionKernel SyclReducerKerneType; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output, + Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) { + Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP); + + // getPowerOfTwo makes sure local range is power of 2 and <= + // maxSyclThreadPerBlock this will help us to avoid extra check on the + // kernel + 
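The power-of-two requirement mentioned above is what makes the halving tree reductions in these kernels valid. A small sketch of the constant-time check used by the static_assert below, plus rounding up to a power of two; round_up_pow2 is a hypothetical illustration of the idea behind the device's getPowerOfTwo helper, not the SYCL runtime call itself:

#include <cstdio>

// A power of two has exactly one bit set, so n & (n - 1) clears it to zero.
constexpr bool is_pow2(unsigned n) { return n != 0 && (n & (n - 1)) == 0; }

unsigned round_up_pow2(unsigned n) {
  unsigned p = 1;
  while (p < n) p <<= 1;  // double until we reach or pass n
  return p;
}

int main() {
  static_assert(is_pow2(256), "local range must be a power of two");
  std::printf("round_up_pow2(100) = %u\n", round_up_pow2(100));  // prints 128
  return 0;
}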
static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
+                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
+                  "The Local thread size must be a power of 2 for the reduction "
+                  "operation");
+
+    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
+    // Here we force the code to perform at most a 2-step reduction. Our empirical research
+    // shows that if each thread reduces at least 64 elements individually we get better
+    // performance, although this can change on different platforms. It also shows that for
+    // the inner_most dim reducer it is better to have 8 groups in a reduce dimension for
+    // sizes > 1024 to achieve the best performance.
+    const Index reductionPerThread = 64;
+    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
+    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
+    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
+    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
+    const Index globalRange = pNumGroups * rNumGroups * localRange;
+
+    EIGEN_CONSTEXPR Index scratchSize =
+        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+    if (rNumGroups > 1) {
+      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
+          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
+      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
+      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+             self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+             num_coeffs_to_reduce)
+          .wait();
+      typedef SecondStepPartialReduction<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Op, Index>
+          SecondStepPartialReductionKernel;
+      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
+             temp_accessor, output,
+             cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)),
+             Index(1), reducer, num_coeffs_to_preserve, rNumGroups)
+          .wait();
+      self.device().deallocate_temp(temp_pointer);
+    } else {
+      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+             self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+             num_coeffs_to_reduce)
+          .wait();
+    }
+    return false;
+  }
+};
+}  // namespace internal
+}  // namespace TensorSycl
+
+namespace internal {
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
+  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
+    typedef std::conditional_t<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType> OutType;
+    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
+                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
+                  "The Local thread size must be a power of 2 for the reduction "
+                  "operation");
+    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+
+    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
+    // Here we force the code to perform at most a 2-step reduction: our empirical research
+    // shows that if each thread reduces at least 512 elements individually, we get better
+    // performance.
+    const Index reductionPerThread = 2048;
+    Index reductionGroup = dev.getPowerOfTwo(
+        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
+    const Index num_work_group = std::min(reductionGroup, local_range);
+    const Index global_range = num_work_group * local_range;
+
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
+    if (num_work_group > 1) {
+      CoeffReturnType *temp_pointer =
+          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
+      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
+      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
+                                                                      local_range, inputSize, reducer)
+          .wait();
+      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
+                                                          EvaluatorPointerType, Index, local_range>
+          GenericRKernel;
+      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
+             tmp_global_accessor, data,
+             cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)),
+             num_work_group, reducer)
+          .wait();
+      dev.deallocate_temp(temp_pointer);
+    } else {
+      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
+                                                                      reducer)
+          .wait();
+    }
+  }
+};
+// Vectorizable inner_most dim preserver (col reduction)
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, Eigen::SyclDevice> {
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+                  typename Self::Index num_coeffs_to_preserve) {
+    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
+                                                                                 num_coeffs_to_reduce,
+                                                                                 num_coeffs_to_preserve);
+  }
+};
+// Row reduction
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, Eigen::SyclDevice> {
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+                  typename Self::Index num_coeffs_to_preserve) {
+    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
+                                                                                 num_coeffs_to_reduce,
+                                                                                 num_coeffs_to_preserve);
+  }
+};
+
+// ArgMax uses this kernel for partial reduction.
+// TODO(@mehdi.goli) come up with a better kernel
+// generic partial reduction
+template <typename Self, typename Op>
+struct GenericReducer<Self, Op, Eigen::SyclDevice> {
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation
= false; + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce, + typename Self::Index num_coeffs_to_preserve) { + typename Self::Index range, GRange, tileSize; + dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); + + dev.template unary_kernel_launcher>( + self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1), + reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast(1)) + .wait(); + return false; + } +}; + +} // namespace internal +} // namespace Eigen + +#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h new file mode 100644 index 00000000000..7061f512093 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -0,0 +1,317 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H +#define EIGEN_CXX11_TENSOR_TENSOR_REF_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +template +class TensorLazyBaseEvaluator { + public: + TensorLazyBaseEvaluator() : m_refcount(0) {} + virtual ~TensorLazyBaseEvaluator() {} + + EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const = 0; + EIGEN_DEVICE_FUNC virtual const Scalar* data() const = 0; + + EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const = 0; + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) = 0; + + void incrRefCount() { ++m_refcount; } + void decrRefCount() { --m_refcount; } + int refCount() const { return m_refcount; } + + private: + // No copy, no assignment; + TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other); + TensorLazyBaseEvaluator& operator=(const TensorLazyBaseEvaluator& other); + + int m_refcount; +}; + +template +class TensorLazyEvaluatorReadOnly + : public TensorLazyBaseEvaluator::Scalar> { + public: + // typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Scalar Scalar; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef TensorEvaluator EvalType; + + TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) { + m_dims = m_impl.dimensions(); + m_impl.evalSubExprsIfNeeded(NULL); + } + virtual ~TensorLazyEvaluatorReadOnly() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const { return m_dims; } + EIGEN_DEVICE_FUNC virtual const Scalar* data() const { return m_impl.data(); } + + EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const { return m_impl.coeff(index); } + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex /*index*/) { + eigen_assert(false && "can't reference the coefficient of a rvalue"); + return m_dummy; + }; + + protected: + TensorEvaluator m_impl; + Dimensions m_dims; + Scalar m_dummy; +}; + +template 
+class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly { + public: + typedef TensorLazyEvaluatorReadOnly Base; + typedef typename Base::Scalar Scalar; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {} + virtual ~TensorLazyEvaluatorWritable() {} + + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) { return this->m_impl.coeffRef(index); } +}; + +template +class TensorLazyEvaluator : public std::conditional_t::value), + TensorLazyEvaluatorWritable, + TensorLazyEvaluatorReadOnly > { + public: + typedef std::conditional_t::value), + TensorLazyEvaluatorWritable, + TensorLazyEvaluatorReadOnly > + Base; + typedef typename Base::Scalar Scalar; + + TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) {} + virtual ~TensorLazyEvaluator() {} +}; + +} // namespace internal + +/** \class TensorRef + * \ingroup CXX11_Tensor_Module + * + * \brief A reference to a tensor expression + * The expression will be evaluated lazily (as much as possible). + * + */ +template +class TensorRef : public TensorBase > { + public: + typedef TensorRef Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::traits::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef Scalar* PointerType; + typedef PointerType PointerArgType; + + static constexpr Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + static constexpr int Layout = PlainObjectType::Layout; + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -----------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {} + + template + EIGEN_STRONG_INLINE TensorRef(const Expression& expr) + : m_evaluator(new internal::TensorLazyEvaluator(expr, DefaultDevice())) { + m_evaluator->incrRefCount(); + } + + template + EIGEN_STRONG_INLINE TensorRef& operator=(const Expression& expr) { + unrefEvaluator(); + m_evaluator = new internal::TensorLazyEvaluator(expr, DefaultDevice()); + m_evaluator->incrRefCount(); + return *this; + } + + ~TensorRef() { unrefEvaluator(); } + + TensorRef(const TensorRef& other) : TensorBase >(other), m_evaluator(other.m_evaluator) { + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + + TensorRef& operator=(const TensorRef& other) { + if (this != &other) { + unrefEvaluator(); + m_evaluator = other.m_evaluator; + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const 
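// The writable/read-only dispatch used by TensorLazyEvaluator above, shown in
// isolation: std::conditional_t selects the base class at compile time. A
// minimal sketch with stand-in types; Eigen's own trait is
// internal::is_lvalue, approximated here with std::is_lvalue_reference.
#include <type_traits>

struct ReadOnlyEval {};                 // coeffRef() would assert at runtime
struct WritableEval : ReadOnlyEval {};  // coeffRef() forwards to the evaluator

template <typename Expr>
using LazyEvalBase = std::conditional_t<std::is_lvalue_reference<Expr>::value,
                                        WritableEval, ReadOnlyEval>;

static_assert(std::is_same<LazyEvalBase<float&>, WritableEval>::value, "");
static_assert(std::is_same<LazyEvalBase<const float>, ReadOnlyEval>::value, "");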
{ return m_evaluator->dimensions().TotalSize(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(Index index) const { return m_evaluator->coeff(index); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const { + const std::size_t num_indices = (sizeof...(otherIndices) + 1); + const array indices{{firstIndex, otherIndices...}}; + return coeff(indices); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) { + const std::size_t num_indices = (sizeof...(otherIndices) + 1); + const array indices{{firstIndex, otherIndices...}}; + return coeffRef(indices); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(const array& indices) const { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (size_t i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices - 1]; + for (int i = NumIndices - 2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeff(index); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (size_t i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices - 1]; + for (int i = NumIndices - 2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index index) const { return m_evaluator->coeff(index); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return m_evaluator->coeffRef(index); } + + private: + EIGEN_STRONG_INLINE void unrefEvaluator() { + if (m_evaluator) { + m_evaluator->decrRefCount(); + if (m_evaluator->refCount() == 0) { + delete m_evaluator; + } + } + } + + internal::TensorLazyBaseEvaluator* m_evaluator; +}; + +// evaluator for rvalues +template +struct TensorEvaluator, Device> { + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorRef::Layout; + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) : m_ref(m) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } + + EIGEN_STRONG_INLINE void cleanup() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) 
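// A minimal usage sketch for TensorRef, assuming the usual unsupported Eigen
// include path; sizes and indices are illustrative. The point is that the
// wrapped expression is evaluated lazily, one coefficient per access, through
// the reference-counted lazy evaluator shown above.
#include <unsupported/Eigen/CXX11/Tensor>

void tensor_ref_sketch() {
  Eigen::Tensor<float, 2> a(64, 64), b(64, 64);
  a.setRandom();
  b.setRandom();

  // No bulk evaluation happens here: the TensorRef just wraps a lazy
  // evaluator for the sum expression and bumps its reference count.
  Eigen::TensorRef<Eigen::Tensor<float, 2>> ref = a + b;

  // Only this one coefficient of a + b is actually computed.
  float x = ref(3, 5);
  (void)x;
}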
const { return m_ref.coeff(index); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return m_ref.coeffRef(index); } + + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_ref.data(); } + + protected: + TensorRef m_ref; +}; + +// evaluator for lvalues +template +struct TensorEvaluator, Device> : public TensorEvaluator, Device> { + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + typedef TensorEvaluator, Device> Base; + + enum { IsAligned = false, PacketAccess = false, BlockAccess = false, PreferBlockAccess = false, RawAccess = false }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(TensorRef& m, const Device& d) : Base(m, d) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return this->m_ref.coeffRef(index); } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h new file mode 100644 index 00000000000..a81a2babfda --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -0,0 +1,410 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Navdeep Jaitly +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorReverse + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reverse elements class. 
+ * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorReverseOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorReverseOp type; +}; + +} // end namespace internal + +template +class TensorReverseOp : public TensorBase, WriteAccessors> { + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(const XprType& expr, const ReverseDimensions& reverse_dims) + : m_xpr(expr), m_reverse_dims(reverse_dims) {} + + EIGEN_DEVICE_FUNC const ReverseDimensions& reverse() const { return m_reverse_dims; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReverseOp) + + protected: + typename XprType::Nested m_xpr; + const ReverseDimensions m_reverse_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorReverseOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = NumDims > 0, + PreferBlockAccess = true, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef internal::TensorIntDivisor IndexDivisor; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reverse(op.reverse()), m_device(device) { + // Reversing a scalar isn't supported yet. It would be a no-op anyway. 
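// Stand-alone version of the column-major stride computation in the
// constructor just below: strides[0] == 1 and each further stride is the
// previous stride scaled by the previous dimension's size, so strides[i] is
// the linear distance between successive indices of dimension i. The rank
// and extents here are illustrative (N must be at least 1).
#include <array>
#include <cstddef>

template <std::size_t N>
std::array<std::ptrdiff_t, N> col_major_strides(const std::array<std::ptrdiff_t, N>& dims) {
  std::array<std::ptrdiff_t, N> strides{};
  strides[0] = 1;
  for (std::size_t i = 1; i < N; ++i) strides[i] = strides[i - 1] * dims[i - 1];
  return strides;
}
// For dims {2, 3, 4} this yields strides {1, 2, 6}.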
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Compute strides + m_dimensions = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]); + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex(Index index) const { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + Index idx = index / m_fastStrides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i]; + } + if (m_reverse[0]) { + inputIndex += (m_dimensions[0] - index - 1); + } else { + inputIndex += index; + } + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + Index idx = index / m_fastStrides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i]; + } + if (m_reverse[NumDims - 1]) { + inputIndex += (m_dimensions[NumDims - 1] - index - 1); + } else { + inputIndex += index; + } + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_impl.coeff(reverseIndex(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + // TODO(ndjaitly): write a better packing routine that uses + // local structure. + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + // Block evaluation reads underlying memory in reverse order, and default + // cost model does not properly catch this in bytes stored/loaded. + return internal::TensorBlockResourceRequirements::skewed(target_size).addCostPerCoeff({0, 0, 24}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + // TODO(ezhulenev): If underlying tensor expression supports and prefers + // block evaluation we must use it. Currently we use coeff and packet + // access into the underlying tensor expression. 
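// The index remapping performed by reverseIndex() above (column-major
// branch), restated over plain arrays instead of the evaluator members.
// Each reversed dimension maps its coordinate idx to dims[i] - idx - 1
// before the linear input index is rebuilt. N >= 1; values illustrative.
#include <array>
#include <cstddef>

template <std::size_t N>
std::ptrdiff_t reverse_index(std::ptrdiff_t index,
                             const std::array<std::ptrdiff_t, N>& dims,
                             const std::array<std::ptrdiff_t, N>& strides,
                             const std::array<bool, N>& reverse) {
  std::ptrdiff_t input = 0;
  for (std::size_t i = N - 1; i > 0; --i) {
    const std::ptrdiff_t idx = index / strides[i];  // coordinate in dim i
    index -= idx * strides[i];                      // remainder for dims < i
    input += (reverse[i] ? dims[i] - idx - 1 : idx) * strides[i];
  }
  input += reverse[0] ? (dims[0] - index - 1) : index;
  return input;
}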
+ // static const bool useBlockAccessForArgType = + // TensorEvaluator::BlockAccess && + // TensorEvaluator::PreferBlockAccess; + + static const bool isColMajor = static_cast(Layout) == static_cast(ColMajor); + + static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1; + const bool inner_dim_reversed = m_reverse[inner_dim_idx]; + + // Offset in the output block. + Index block_offset = 0; + + // Offset in the input Tensor. + Index input_offset = reverseIndex(desc.offset()); + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - 1 - i; + it[i].size = desc.dimension(dim); + it[i].count = 0; + it[i].reverse = m_reverse[dim]; + + it[i].block_stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].block_stride); + it[i].block_span = it[i].block_stride * (it[i].size - 1); + + it[i].input_stride = m_strides[dim]; + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + if (it[i].reverse) { + it[i].input_stride = -1 * it[i].input_stride; + it[i].input_span = -1 * it[i].input_span; + } + } + + // If multiple inner dimensions have the same reverse flag, check if we can + // merge them into a single virtual inner dimension. + int effective_inner_dim = 0; + for (int i = 1; i < NumDims; ++i) { + if (it[i].reverse != it[effective_inner_dim].reverse) break; + if (it[i].block_stride != it[effective_inner_dim].size) break; + if (it[i].block_stride != numext::abs(it[i].input_stride)) break; + + it[i].size = it[effective_inner_dim].size * it[i].size; + + it[i].block_stride = 1; + it[i].input_stride = (inner_dim_reversed ? -1 : 1); + + it[i].block_span = it[i].block_stride * (it[i].size - 1); + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + effective_inner_dim = i; + } + + eigen_assert(it[effective_inner_dim].block_stride == 1); + eigen_assert(it[effective_inner_dim].input_stride == (inner_dim_reversed ? -1 : 1)); + + const Index inner_dim_size = it[effective_inner_dim].size; + + // Prepare storage for the materialized reverse result. + const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + CoeffReturnType* block_buffer = block_storage.data(); + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + // Copy inner-most dimension data from reversed location in input. + Index dst = block_offset; + Index src = input_offset; + + // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed + // worse results in benchmarks than a simple coefficient loop. + if (inner_dim_reversed) { + for (Index i = 0; i < inner_dim_size; ++i) { + block_buffer[dst] = m_impl.coeff(src); + ++dst; + --src; + } + } else { + for (Index i = 0; i < inner_dim_size; ++i) { + block_buffer[dst] = m_impl.coeff(src); + ++dst; + ++src; + } + } + + // For the 1d tensor we need to generate only one inner-most dimension. + if ((NumDims - effective_inner_dim) == 1) break; + + // Update offset. 
+ for (Index i = effective_inner_dim + 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + block_offset += it[i].block_stride; + input_offset += it[i].input_stride; + break; + } + if (i != NumDims - 1) it[i].count = 0; + block_offset -= it[i].block_span; + input_offset -= it[i].input_span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + for (int i = 0; i < NumDims; ++i) { + if (m_reverse[i]) { + compute_cost += 2 * TensorOpCost::AddCost(); + } + } + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_strides; + array m_fastStrides; + TensorEvaluator m_impl; + ReverseDimensions m_reverse; + const Device EIGEN_DEVICE_REF m_device; + + private: + struct BlockIteratorState { + BlockIteratorState() + : size(0), count(0), reverse(false), block_stride(0), block_span(0), input_stride(0), input_span(0) {} + + Index size; + Index count; + bool reverse; + Index block_stride; + Index block_span; + Index input_stride; + Index input_span; + }; +}; + +// Eval as lvalue + +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorReverseOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const { + return this->m_impl.coeffRef(this->reverseIndex(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + // This code is pilfered from TensorMorphing.h + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; + internal::pstore(values, x); + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index + i) = values[i]; + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRoll.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRoll.h new file mode 100644 index 00000000000..d5b203ab619 --- /dev/null +++ 
b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRoll.h @@ -0,0 +1,361 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2024 Tobias Wood tobias@spinicist.org.uk +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ROLL_H +#define EIGEN_CXX11_TENSOR_TENSOR_ROLL_H +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorRoll + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor roll (circular shift) elements class. + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorRollOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorRollOp type; +}; + +} // end namespace internal + +template +class TensorRollOp : public TensorBase, WriteAccessors> { + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorRollOp(const XprType& expr, const RollDimensions& roll_dims) + : m_xpr(expr), m_roll_dims(roll_dims) {} + + EIGEN_DEVICE_FUNC const RollDimensions& roll() const { return m_roll_dims; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorRollOp) + + protected: + typename XprType::Nested m_xpr; + const RollDimensions m_roll_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorRollOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = NumDims > 0, + PreferBlockAccess = true, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef internal::TensorIntDivisor IndexDivisor; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + using TensorBlockDesc = internal::TensorBlockDescriptor; + using TensorBlockScratch = internal::TensorBlockScratchAllocator; + using ArgTensorBlock = typename TensorEvaluator::TensorBlock; + using 
TensorBlock = typename internal::TensorMaterializedBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_rolls(op.roll()), m_device(device) { + EIGEN_STATIC_ASSERT((NumDims > 0), Must_Have_At_Least_One_Dimension_To_Roll); + + // Compute strides + m_dimensions = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + if (m_strides[i] > 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + if (m_strides[i] > 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(nullptr); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index roll(Index const i, Index const r, Index const n) const { + auto const tmp = (i + r) % n; + if (tmp < 0) { + return tmp + n; + } else { + return tmp; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array rollCoords(array const& coords) const { + array rolledCoords; + for (int id = 0; id < NumDims; id++) { + eigen_assert(coords[id] < m_dimensions[id]); + rolledCoords[id] = roll(coords[id], m_rolls[id], m_dimensions[id]); + } + return rolledCoords; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rollIndex(Index index) const { + eigen_assert(index < dimensions().TotalSize()); + Index rolledIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + rolledIndex += roll(idx, m_rolls[i], m_dimensions[i]) * m_strides[i]; + } + rolledIndex += roll(index, m_rolls[0], m_dimensions[0]); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + rolledIndex += roll(idx, m_rolls[i], m_dimensions[i]) * m_strides[i]; + } + rolledIndex += roll(index, m_rolls[NumDims - 1], m_dimensions[NumDims - 1]); + } + return rolledIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_impl.coeff(rollIndex(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return 
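// The modular shift computed by roll() above, as a stand-alone helper. In
// C++ the % operator keeps the sign of the dividend, so a negative remainder
// has to be shifted back into [0, n); values here are illustrative.
long roll_coord(long i, long r, long n) {
  const long tmp = (i + r) % n;
  return tmp < 0 ? tmp + n : tmp;
}
// Rolling coordinate 0 by -1 in a dimension of size 5 wraps around to 4:
// roll_coord(0, -1, 5) == 4, while roll_coord(4, 1, 5) == 0.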
internal::TensorBlockResourceRequirements::skewed(target_size).addCostPerCoeff({0, 0, 24}); + } + + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + static const bool is_col_major = static_cast(Layout) == static_cast(ColMajor); + + // Compute spatial coordinates for the first block element. + array coords; + extract_coordinates(desc.offset(), coords); + array initial_coords = coords; + Index offset = 0; // Offset in the output block buffer. + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims; ++i) { + const int dim = is_col_major ? i : NumDims - 1 - i; + it[i].size = desc.dimension(dim); + it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride); + it[i].span = it[i].stride * (it[i].size - 1); + it[i].count = 0; + } + eigen_assert(it[0].stride == 1); + + // Prepare storage for the materialized generator result. + const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + CoeffReturnType* block_buffer = block_storage.data(); + + static const int inner_dim = is_col_major ? 0 : NumDims - 1; + const Index inner_dim_size = it[0].size; + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + Index i = 0; + for (; i < inner_dim_size; ++i) { + auto const rolled = rollCoords(coords); + auto const index = is_col_major ? m_dimensions.IndexOfColMajor(rolled) : m_dimensions.IndexOfRowMajor(rolled); + *(block_buffer + offset + i) = m_impl.coeff(index); + coords[inner_dim]++; + } + coords[inner_dim] = initial_coords[inner_dim]; + + if (NumDims == 1) break; // For the 1d tensor we need to generate only one inner-most dimension. + + // Update offset. + for (i = 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + offset += it[i].stride; + coords[is_col_major ? i : NumDims - 1 - i]++; + break; + } + if (i != NumDims - 1) it[i].count = 0; + coords[is_col_major ? i : NumDims - 1 - i] = initial_coords[is_col_major ? 
i : NumDims - 1 - i]; + offset -= it[i].span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + for (int i = 0; i < NumDims; ++i) { + compute_cost += 2 * TensorOpCost::AddCost(); + } + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return nullptr; } + + protected: + Dimensions m_dimensions; + array m_strides; + array m_fast_strides; + TensorEvaluator m_impl; + RollDimensions m_rolls; + const Device EIGEN_DEVICE_REF m_device; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_coordinates(Index index, array& coords) const { + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[NumDims - 1] = index; + } + } + + private: +}; + +// Eval as lvalue + +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorRollOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, + RawAccess = false + }; + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const { + return this->m_impl.coeffRef(this->rollIndex(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; + internal::pstore(values, x); + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index + i) = values[i]; + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ROLL_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h new file mode 100644 index 00000000000..d16d1490c8a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -0,0 +1,474 @@ +// This file is part of Eigen, a 
lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Igor Babuschkin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H +#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorScanOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorScanOp type; +}; +} // end namespace internal + +/** \class TensorScan + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor scan class. + */ +template +class TensorScanOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(const XprType& expr, const Index& axis, bool exclusive = false, + const Op& op = Op()) + : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index axis() const { return m_axis; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op accumulator() const { return m_accumulator; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { return m_exclusive; } + + protected: + typename XprType::Nested m_expr; + const Index m_axis; + const Op m_accumulator; + const bool m_exclusive; +}; + +namespace internal { + +template +EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset, typename Self::CoeffReturnType* data) { + // Compute the scan along the axis, starting at the given offset + typename Self::CoeffReturnType accum = self.accumulator().initialize(); + if (self.stride() == 1) { + if (self.exclusive()) { + for (Index curr = offset; curr < offset + self.size(); ++curr) { + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } + } else { + for (Index curr = offset; curr < offset + self.size(); ++curr) { + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); + } + } + } else { + if (self.exclusive()) { + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + Index curr = offset + idx3 * self.stride(); + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } + } else { + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + Index curr = offset + idx3 * self.stride(); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = 
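// The two branches of ReduceScalar above for the contiguous (stride == 1)
// case, with a plain running sum standing in for the generic reducer;
// illustrative only. Exclusive scans write the accumulator before folding
// in the current element, inclusive scans after.
#include <cstddef>
#include <vector>

void scan_1d(const std::vector<float>& in, std::vector<float>& out, bool exclusive) {
  float accum = 0.0f;  // plays the role of accumulator.initialize()
  for (std::size_t i = 0; i < in.size(); ++i) {
    if (exclusive) {
      out[i] = accum;  // prefix excluding in[i] ...
      accum += in[i];  // ... then fold in[i] into the running total
    } else {
      accum += in[i];  // fold first ...
      out[i] = accum;  // ... so the prefix includes in[i]
    }
  }
}
// {1, 2, 3} scans to {1, 3, 6} inclusively and to {0, 1, 3} exclusively.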
self.accumulator().finalize(accum); + } + } + } +} + +template +EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset, typename Self::CoeffReturnType* data) { + using Scalar = typename Self::CoeffReturnType; + using Packet = typename Self::PacketReturnType; + // Compute the scan along the axis, starting at the calculated offset + Packet accum = self.accumulator().template initializePacket(); + if (self.stride() == 1) { + if (self.exclusive()) { + for (Index curr = offset; curr < offset + self.size(); ++curr) { + internal::pstoreu(data + curr, self.accumulator().finalizePacket(accum)); + self.accumulator().reducePacket(self.inner().template packet(curr), &accum); + } + } else { + for (Index curr = offset; curr < offset + self.size(); ++curr) { + self.accumulator().reducePacket(self.inner().template packet(curr), &accum); + internal::pstoreu(data + curr, self.accumulator().finalizePacket(accum)); + } + } + } else { + if (self.exclusive()) { + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + const Index curr = offset + idx3 * self.stride(); + internal::pstoreu(data + curr, self.accumulator().finalizePacket(accum)); + self.accumulator().reducePacket(self.inner().template packet(curr), &accum); + } + } else { + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + const Index curr = offset + idx3 * self.stride(); + self.accumulator().reducePacket(self.inner().template packet(curr), &accum); + internal::pstoreu(data + curr, self.accumulator().finalizePacket(accum)); + } + } + } +} + +template +struct ReduceBlock { + EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) { + for (Index idx2 = 0; idx2 < self.stride(); idx2++) { + // Calculate the starting offset for the scan + Index offset = idx1 + idx2; + ReduceScalar(self, offset, data); + } + } +}; + +// Specialization for vectorized reduction. +template +struct ReduceBlock { + EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) { + using Packet = typename Self::PacketReturnType; + const int PacketSize = internal::unpacket_traits::size; + Index idx2 = 0; + for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) { + // Calculate the starting offset for the packet scan + Index offset = idx1 + idx2; + ReducePacket(self, offset, data); + } + for (; idx2 < self.stride(); idx2++) { + // Calculate the starting offset for the scan + Index offset = idx1 + idx2; + ReduceScalar(self, offset, data); + } + } +}; + +// Single-threaded CPU implementation of scan +template ::PacketAccess && + internal::reducer_traits::PacketAccess)> +struct ScanLauncher { + void operator()(Self& self, typename Self::CoeffReturnType* data) const { + Index total_size = internal::array_prod(self.dimensions()); + + // We fix the index along the scan axis to 0 and perform a + // scan per remaining entry. The iteration is split into two nested + // loops to avoid an integer division by keeping track of each idx1 and + // idx2. + for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) { + ReduceBlock block_reducer; + block_reducer(self, idx1, data); + } + } +}; + +#ifdef EIGEN_USE_THREADS + +// Adjust block_size to avoid false sharing of cachelines among +// threads. Currently set to twice the cache line size on Intel and ARM +// processors. 
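// Worked form of the AdjustBlockSize() rounding that follows: shard sizes
// are rounded up to a whole number of kBlockAlignment-sized groups so that
// two threads never write to the same cache line of the output. Plain
// integer math, illustrative values.
#include <algorithm>
#include <cstddef>

std::ptrdiff_t adjust_block_size(std::ptrdiff_t item_size, std::ptrdiff_t block_size) {
  const std::ptrdiff_t kBlockAlignment = 128;  // two 64-byte cache lines
  const std::ptrdiff_t items_per_line =
      std::max<std::ptrdiff_t>(1, kBlockAlignment / item_size);
  // Round block_size up to a multiple of items_per_line (div_ceil).
  return items_per_line * ((block_size + items_per_line - 1) / items_per_line);
}
// adjust_block_size(sizeof(float), 100) == 128: 100 floats round up to four
// 32-float cache-line groups.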
+EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) { + EIGEN_CONSTEXPR Index kBlockAlignment = 128; + const Index items_per_cacheline = numext::maxi(1, kBlockAlignment / item_size); + return items_per_cacheline * numext::div_ceil(block_size, items_per_cacheline); +} + +template +struct ReduceBlock { + EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) { + using Scalar = typename Self::CoeffReturnType; + using Packet = typename Self::PacketReturnType; + const int PacketSize = internal::unpacket_traits::size; + Index num_scalars = self.stride(); + Index num_packets = 0; + if (self.stride() >= PacketSize) { + num_packets = self.stride() / PacketSize; + self.device().parallelFor( + num_packets, + TensorOpCost(PacketSize * self.size(), PacketSize * self.size(), 16 * PacketSize * self.size(), true, + PacketSize), + // Make the shard size large enough that two neighboring threads + // won't write to the same cacheline of `data`. + [=](Index blk_size) { return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size); }, + [&](Index first, Index last) { + for (Index packet = first; packet < last; ++packet) { + const Index idx2 = packet * PacketSize; + ReducePacket(self, idx1 + idx2, data); + } + }); + num_scalars -= num_packets * PacketSize; + } + self.device().parallelFor( + num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()), + // Make the shard size large enough that two neighboring threads + // won't write to the same cacheline of `data`. + [=](Index blk_size) { return AdjustBlockSize(sizeof(Scalar), blk_size); }, + [&](Index first, Index last) { + for (Index scalar = first; scalar < last; ++scalar) { + const Index idx2 = num_packets * PacketSize + scalar; + ReduceScalar(self, idx1 + idx2, data); + } + }); + } +}; + +template +struct ReduceBlock { + EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) { + using Scalar = typename Self::CoeffReturnType; + self.device().parallelFor( + self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()), + // Make the shard size large enough that two neighboring threads + // won't write to the same cacheline of `data`. + [=](Index blk_size) { return AdjustBlockSize(sizeof(Scalar), blk_size); }, + [&](Index first, Index last) { + for (Index idx2 = first; idx2 < last; ++idx2) { + ReduceScalar(self, idx1 + idx2, data); + } + }); + } +}; + +// Specialization for multi-threaded execution. +template +struct ScanLauncher { + void operator()(Self& self, typename Self::CoeffReturnType* data) { + using Scalar = typename Self::CoeffReturnType; + using Packet = typename Self::PacketReturnType; + const int PacketSize = internal::unpacket_traits::size; + const Index total_size = internal::array_prod(self.dimensions()); + const Index inner_block_size = self.stride() * self.size(); + bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size)); + + if ((parallelize_by_outer_blocks && total_size <= 4096) || + (!parallelize_by_outer_blocks && self.stride() < PacketSize)) { + ScanLauncher launcher; + launcher(self, data); + return; + } + + if (parallelize_by_outer_blocks) { + // Parallelize over outer blocks. 
+ const Index num_outer_blocks = total_size / inner_block_size; + self.device().parallelFor( + num_outer_blocks, + TensorOpCost(inner_block_size, inner_block_size, 16 * PacketSize * inner_block_size, Vectorize, PacketSize), + [=](Index blk_size) { return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size); }, + [&](Index first, Index last) { + for (Index idx1 = first; idx1 < last; ++idx1) { + ReduceBlock block_reducer; + block_reducer(self, idx1 * inner_block_size, data); + } + }); + } else { + // Parallelize over inner packets/scalars dimensions when the reduction + // axis is not an inner dimension. + ReduceBlock block_reducer; + for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) { + block_reducer(self, idx1, data); + } + } + } +}; +#endif // EIGEN_USE_THREADS + +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + +// GPU implementation of scan +// TODO(ibab) This placeholder implementation performs multiple scans in +// parallel, but it would be better to use a parallel scan algorithm and +// optimize memory access. +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, + typename Self::CoeffReturnType* data) { + // Compute offset as in the CPU version + Index val = threadIdx.x + blockIdx.x * blockDim.x; + Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride(); + + if (offset + (self.size() - 1) * self.stride() < total_size) { + // Compute the scan along the axis, starting at the calculated offset + typename Self::CoeffReturnType accum = self.accumulator().initialize(); + for (Index idx = 0; idx < self.size(); idx++) { + Index curr = offset + idx * self.stride(); + if (self.exclusive()) { + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } else { + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); + } + } + } + __syncthreads(); +} + +template +struct ScanLauncher { + void operator()(const Self& self, typename Self::CoeffReturnType* data) { + Index total_size = internal::array_prod(self.dimensions()); + Index num_blocks = (total_size / self.size() + 63) / 64; + Index block_size = 64; + + LAUNCH_GPU_KERNEL((ScanKernel), num_blocks, block_size, 0, self.device(), self, total_size, data); + } +}; +#endif // EIGEN_USE_GPU && (EIGEN_GPUCC) + +} // namespace internal + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorScanOp XprType; + typedef typename XprType::Index Index; + typedef const ArgType ChildTypeNoConst; + typedef const ArgType ChildType; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef std::remove_const_t Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef TensorEvaluator, Device> Self; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, + RawAccess = true + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, 
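// The strategy selection of the multi-threaded ScanLauncher above, condensed
// into one function: small problems fall back to the sequential launcher;
// otherwise parallelism goes over independent outer blocks when there are
// enough of them, else over the inner packets/scalars of each block. The
// thresholds mirror the code; the enum and parameter names are invented for
// this sketch.
enum class ScanStrategy { Sequential, OuterBlocks, InnerDims };

ScanStrategy pick_scan_strategy(long total_size, long stride, long scan_size,
                                long packet_size) {
  const long inner_block_size = stride * scan_size;
  const bool by_outer_blocks = total_size >= stride * inner_block_size;
  if ((by_outer_blocks && total_size <= 4096) ||
      (!by_outer_blocks && stride < packet_size))
    return ScanStrategy::Sequential;
  return by_outer_blocks ? ScanStrategy::OuterBlocks : ScanStrategy::InnerDims;
}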
const Device& device) + : m_impl(op.expression(), device), + m_device(device), + m_exclusive(op.exclusive()), + m_accumulator(op.accumulator()), + m_size(m_impl.dimensions()[op.axis()]), + m_stride(1), + m_consume_dim(op.axis()), + m_output(NULL) { + // Accumulating a scalar isn't supported. + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(op.axis() >= 0 && op.axis() < NumDims); + + // Compute stride of scan axis + const Dimensions& dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < op.axis(); ++i) { + m_stride = m_stride * dims[i]; + } + } else { + // dims can only be indexed through unsigned integers, + // so let's use an unsigned type to let the compiler knows. + // This prevents stupid warnings: ""'*((void*)(& evaluator)+64)[18446744073709551615]' may be used uninitialized + // in this function" + unsigned int axis = internal::convert_index(op.axis()); + for (unsigned int i = NumDims - 1; i > axis; --i) { + m_stride = m_stride * dims[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const { return m_stride; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const { return m_consume_dim; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const { return m_size; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const { return m_accumulator; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { return m_exclusive; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& inner() const { return m_impl; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + internal::ScanLauncher launcher; + if (data) { + launcher(*this, data); + return false; + } + + const Index total_size = internal::array_prod(dimensions()); + m_output = + static_cast(m_device.get((Scalar*)m_device.allocate_temp(total_size * sizeof(Scalar)))); + launcher(*this, m_output); + return true; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_output + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_output; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_output[index]; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + + EIGEN_STRONG_INLINE void cleanup() { + if (m_output) { + m_device.deallocate_temp(m_output); + m_output = NULL; + } + m_impl.cleanup(); + } + + protected: + TensorEvaluator m_impl; + const Device EIGEN_DEVICE_REF m_device; + const bool m_exclusive; + Op m_accumulator; + const Index m_size; + Index m_stride; + Index m_consume_dim; + EvaluatorPointerType m_output; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h new file mode 100644 index 00000000000..30fde91970a --- /dev/null +++ 
b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
@@ -0,0 +1,506 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact:
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorScanSycl.h
+ *
+ * \brief:
+ * The Tensor Scan SYCL implementation extends the algorithm of
+ * "Efficient parallel scan algorithms for GPUs" to Tensor operations.
+ * The algorithm requires up to 3 stages (and consequently 3 kernels),
+ * depending on the size of the tensor. In the first kernel
+ * (ScanKernelFunctor), each thread within a work-group individually
+ * reduces its allocated elements in order to reduce the total number of
+ * blocks; all threads within the work-group then reduce the associated
+ * blocks into the temporary buffers. In the next kernel
+ * (ScanBlockKernelFunctor), the temporary buffer is given as input, and
+ * all threads within a work-group scan and reduce the boundaries between
+ * the blocks (generated by the previous kernel) and write the data to the
+ * temporary buffer. If the second kernel is required, the third and final
+ * kernel (ScanAdjustmentKernelFunctor) adjusts the final result into the
+ * output buffer.
+ * The original algorithm for the parallel prefix sum can be found here:
+ *
+ * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient
+ * parallel scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech.
+ * Rep. NVR-2008-003 1, no. 1 (2008): 1-17.
+ *****************************************************************/ + +#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP +#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace TensorSycl { +namespace internal { + +#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE +#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4) +#endif + +template +struct ScanParameters { + // must be power of 2 + static EIGEN_CONSTEXPR index_t ScanPerThread = 8; + const index_t total_size; + const index_t non_scan_size; + const index_t scan_size; + const index_t non_scan_stride; + const index_t scan_stride; + const index_t panel_threads; + const index_t group_threads; + const index_t block_threads; + const index_t elements_per_group; + const index_t elements_per_block; + const index_t loop_range; + + ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_, + index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_, + index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_) + : total_size(total_size_), + non_scan_size(non_scan_size_), + scan_size(scan_size_), + non_scan_stride(non_scan_stride_), + scan_stride(scan_stride_), + panel_threads(panel_threads_), + group_threads(group_threads_), + block_threads(block_threads_), + elements_per_group(elements_per_group_), + elements_per_block(elements_per_block_), + loop_range(loop_range_) {} +}; + +enum class scan_step { first, second }; +template +struct ScanKernelFunctor { + typedef cl::sycl::accessor + LocalAccessor; + static EIGEN_CONSTEXPR int PacketSize = ScanParameters::ScanPerThread / 2; + + LocalAccessor scratch; + Evaluator dev_eval; + OutAccessor out_ptr; + OutAccessor tmp_ptr; + const ScanParameters scanParameters; + Op accumulator; + const bool inclusive; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_, + OutAccessor out_accessor_, OutAccessor temp_accessor_, + const ScanParameters scanParameters_, Op accumulator_, + const bool inclusive_) + : scratch(scratch_), + dev_eval(dev_eval_), + out_ptr(out_accessor_), + tmp_ptr(temp_accessor_), + scanParameters(scanParameters_), + accumulator(accumulator_), + inclusive(inclusive_) {} + + template + std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read( + const Input &inpt, Index global_id) const { + return inpt.coeff(global_id); + } + + template + std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read( + const Input &inpt, Index global_id) const { + return inpt[global_id]; + } + + template + std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation( + InclusiveOp inclusive_op) const { + inclusive_op(); + } + + template + std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation( + InclusiveOp) const {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { + for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) { + Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset)); + Index tmp = data_offset % scanParameters.panel_threads; + const Index panel_id = data_offset / scanParameters.panel_threads; + const Index group_id = tmp / scanParameters.group_threads; + tmp = tmp % scanParameters.group_threads; + const Index block_id = tmp / 
scanParameters.block_threads; + const Index local_id = tmp % scanParameters.block_threads; + // we put one element per packet in scratch_mem + const Index scratch_stride = scanParameters.elements_per_block / PacketSize; + const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride; + CoeffReturnType private_scan[ScanParameters::ScanPerThread]; + CoeffReturnType inclusive_scan; + // the actual panel size is scan_size * non_scan_size. + // elements_per_panel is roundup to power of 2 for binary tree + const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size; + const Index group_offset = group_id * scanParameters.non_scan_stride; + // This will be effective when the size is bigger than elements_per_block + const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride; + const Index thread_offset = (ScanParameters::ScanPerThread * local_id * scanParameters.scan_stride); + const Index global_offset = panel_offset + group_offset + block_offset + thread_offset; + Index next_elements = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < ScanParameters::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + private_scan[i] = ((((block_id * scanParameters.elements_per_block) + + (ScanParameters::ScanPerThread * local_id) + i) < scanParameters.scan_size) && + (global_id < scanParameters.total_size)) + ? read(dev_eval, global_id) + : accumulator.initialize(); + next_elements += scanParameters.scan_stride; + } + first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC { + if (inclusive) { + inclusive_scan = private_scan[ScanParameters::ScanPerThread - 1]; + } + }); + // This for loop must be 2 + EIGEN_UNROLL_LOOP + for (int packetIndex = 0; packetIndex < ScanParameters::ScanPerThread; packetIndex += PacketSize) { + Index private_offset = 1; + // build sum in place up the tree + EIGEN_UNROLL_LOOP + for (Index d = PacketSize >> 1; d > 0; d >>= 1) { + EIGEN_UNROLL_LOOP + for (Index l = 0; l < d; l++) { + Index ai = private_offset * (2 * l + 1) - 1 + packetIndex; + Index bi = private_offset * (2 * l + 2) - 1 + packetIndex; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(private_scan[ai], &accum); + accumulator.reduce(private_scan[bi], &accum); + private_scan[bi] = accumulator.finalize(accum); + } + private_offset *= 2; + } + scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset] = + private_scan[PacketSize - 1 + packetIndex]; + private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize(); + // traverse down tree & build scan + EIGEN_UNROLL_LOOP + for (Index d = 1; d < PacketSize; d *= 2) { + private_offset >>= 1; + EIGEN_UNROLL_LOOP + for (Index l = 0; l < d; l++) { + Index ai = private_offset * (2 * l + 1) - 1 + packetIndex; + Index bi = private_offset * (2 * l + 2) - 1 + packetIndex; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(private_scan[ai], &accum); + accumulator.reduce(private_scan[bi], &accum); + private_scan[ai] = private_scan[bi]; + private_scan[bi] = accumulator.finalize(accum); + } + } + } + + Index offset = 1; + // build sum in place up the tree + for (Index d = scratch_stride >> 1; d > 0; d >>= 1) { + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (local_id < d) { + Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset; + Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset; + CoeffReturnType accum = accumulator.initialize(); + 
accumulator.reduce(scratch[ai], &accum);
+          accumulator.reduce(scratch[bi], &accum);
+          scratch[bi] = accumulator.finalize(accum);
+        }
+        offset *= 2;
+      }
+      // Synchronise
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      // next step optimisation: save this block's total so the follow-up kernels can adjust across blocks
+      if (local_id == 0) {
+        if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
+          const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
+                                    scanParameters.non_scan_size +
+                                group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
+                                block_id;
+          tmp_ptr[temp_id] = scratch[scratch_stride - 1 + scratch_offset];
+        }
+        // clear the last element
+        scratch[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
+      }
+      // traverse down tree & build scan
+      for (Index d = 1; d < scratch_stride; d *= 2) {
+        offset >>= 1;
+        // Synchronise
+        itemID.barrier(cl::sycl::access::fence_space::local_space);
+        if (local_id < d) {
+          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
+          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
+          CoeffReturnType accum = accumulator.initialize();
+          accumulator.reduce(scratch[ai], &accum);
+          accumulator.reduce(scratch[bi], &accum);
+          scratch[ai] = scratch[bi];
+          scratch[bi] = accumulator.finalize(accum);
+        }
+      }
+      // Synchronise
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      // This loop runs exactly twice: ScanPerThread / PacketSize == 2
+      EIGEN_UNROLL_LOOP
+      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
+        EIGEN_UNROLL_LOOP
+        for (Index i = 0; i < PacketSize; i++) {
+          CoeffReturnType accum = private_scan[packetIndex + i];
+          accumulator.reduce(scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
+          private_scan[packetIndex + i] = accumulator.finalize(accum);
+        }
+      }
+      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
+        if (inclusive) {
+          accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
+          private_scan[0] = accumulator.finalize(inclusive_scan);
+        }
+      });
+      next_elements = 0;
+      // write out the first set of private (per-thread) scan results
+      EIGEN_UNROLL_LOOP
+      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+        Index global_id = global_offset + next_elements;
+        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
+             scanParameters.scan_size) &&
+            (global_id < scanParameters.total_size)) {
+          Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
+          out_ptr[global_id] = private_scan[private_id];
+        }
+        next_elements += scanParameters.scan_stride;
+      }
+    }  // end for loop
+  }
+};
+
+template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
+struct ScanAdjustmentKernelFunctor {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+  InAccessor in_ptr;
+  OutAccessor out_ptr;
+  const ScanParameters<Index> scanParameters;
+  Op accumulator;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
+                                                                    OutAccessor out_accessor_,
+                                                                    const ScanParameters<Index> scanParameters_,
+                                                                    Op accumulator_)
+      : in_ptr(in_accessor_), out_ptr(out_accessor_), scanParameters(scanParameters_), accumulator(accumulator_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
+    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
+      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) *
loop_offset)); + Index tmp = data_offset % scanParameters.panel_threads; + const Index panel_id = data_offset / scanParameters.panel_threads; + const Index group_id = tmp / scanParameters.group_threads; + tmp = tmp % scanParameters.group_threads; + const Index block_id = tmp / scanParameters.block_threads; + const Index local_id = tmp % scanParameters.block_threads; + + // the actual panel size is scan_size * non_scan_size. + // elements_per_panel is roundup to power of 2 for binary tree + const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size; + const Index group_offset = group_id * scanParameters.non_scan_stride; + // This will be effective when the size is bigger than elements_per_block + const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride; + const Index thread_offset = ScanParameters::ScanPerThread * local_id * scanParameters.scan_stride; + + const Index global_offset = panel_offset + group_offset + block_offset + thread_offset; + const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block; + const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id; + CoeffReturnType adjust_val = in_ptr[in_id]; + + Index next_elements = 0; + EIGEN_UNROLL_LOOP + for (Index i = 0; i < ScanParameters::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + if ((((block_id * scanParameters.elements_per_block) + (ScanParameters::ScanPerThread * local_id) + i) < + scanParameters.scan_size) && + (global_id < scanParameters.total_size)) { + CoeffReturnType accum = adjust_val; + accumulator.reduce(out_ptr[global_id], &accum); + out_ptr[global_id] = accumulator.finalize(accum); + } + next_elements += scanParameters.scan_stride; + } + } + } +}; + +template +struct ScanInfo { + const Index &total_size; + const Index &scan_size; + const Index &panel_size; + const Index &non_scan_size; + const Index &scan_stride; + const Index &non_scan_stride; + + Index max_elements_per_block; + Index block_size; + Index panel_threads; + Index group_threads; + Index block_threads; + Index elements_per_group; + Index elements_per_block; + Index loop_range; + Index global_range; + Index local_range; + const Eigen::SyclDevice &dev; + EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_, + const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_, + const Eigen::SyclDevice &dev_) + : total_size(total_size_), + scan_size(scan_size_), + panel_size(panel_size_), + non_scan_size(non_scan_size_), + scan_stride(scan_stride_), + non_scan_stride(non_scan_stride_), + dev(dev_) { + // must be power of 2 + local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()), + Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1)); + + max_elements_per_block = local_range * ScanParameters::ScanPerThread; + + elements_per_group = + dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters::ScanPerThread)), true); + const Index elements_per_panel = elements_per_group * non_scan_size; + elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block)); + panel_threads = elements_per_panel / ScanParameters::ScanPerThread; + group_threads = elements_per_group / ScanParameters::ScanPerThread; + block_threads = elements_per_block / ScanParameters::ScanPerThread; + block_size = elements_per_group / elements_per_block; +#ifdef 
EIGEN_SYCL_MAX_GLOBAL_RANGE + const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE)); +#else + const Index max_threads = panel_threads * panel_size; +#endif + global_range = roundUp(max_threads, local_range); + loop_range = Index( + std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters::ScanPerThread))); + } + inline ScanParameters get_scan_parameter() { + return ScanParameters(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads, + group_threads, block_threads, elements_per_group, elements_per_block, loop_range); + } + inline cl::sycl::nd_range<1> get_thread_range() { + return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + } +}; + +template +struct SYCLAdjustBlockOffset { + EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr, + Reducer &accumulator, const Index total_size, + const Index scan_size, const Index panel_size, + const Index non_scan_size, const Index scan_stride, + const Index non_scan_stride, const Eigen::SyclDevice &dev) { + auto scan_info = + ScanInfo(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev); + + typedef ScanAdjustmentKernelFunctor + AdjustFuctor; + dev.template unary_kernel_launcher(in_ptr, out_ptr, scan_info.get_thread_range(), + scan_info.max_elements_per_block, + scan_info.get_scan_parameter(), accumulator) + .wait(); + } +}; + +template +struct ScanLauncher_impl { + template + EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator, + const Index total_size, const Index scan_size, const Index panel_size, + const Index non_scan_size, const Index scan_stride, + const Index non_scan_stride, const bool inclusive, + const Eigen::SyclDevice &dev) { + auto scan_info = + ScanInfo(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev); + const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size; + const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters::ScanPerThread / 2); + CoeffReturnType *temp_pointer = + static_cast(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer); + + typedef ScanKernelFunctor ScanFunctor; + dev.template binary_kernel_launcher( + in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size, + scan_info.get_scan_parameter(), accumulator, inclusive) + .wait(); + + if (scan_info.block_size > 1) { + ScanLauncher_impl::scan_block( + tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size, + non_scan_size, Index(1), scan_info.block_size, false, dev); + + SYCLAdjustBlockOffset::adjust_scan_block_offset( + tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, + non_scan_stride, dev); + } + dev.deallocate_temp(temp_pointer); + } +}; + +} // namespace internal +} // namespace TensorSycl +namespace internal { +template +struct ScanLauncher { + typedef typename Self::Index Index; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::Storage Storage; + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; + void operator()(Self &self, EvaluatorPointerType data) const { + const Index total_size = internal::array_prod(self.dimensions()); + const 
Index scan_size = self.size(); + const Index scan_stride = self.stride(); + // this is the scan op (can be sum or ...) + auto accumulator = self.accumulator(); + auto inclusive = !self.exclusive(); + auto consume_dim = self.consume_dim(); + auto dev = self.device(); + + auto dims = self.inner().dimensions(); + + Index non_scan_size = 1; + Index panel_size = 1; + if (static_cast(Self::Layout) == static_cast(ColMajor)) { + for (int i = 0; i < consume_dim; i++) { + non_scan_size *= dims[i]; + } + for (int i = consume_dim + 1; i < Self::NumDims; i++) { + panel_size *= dims[i]; + } + } else { + for (int i = Self::NumDims - 1; i > consume_dim; i--) { + non_scan_size *= dims[i]; + } + for (int i = consume_dim - 1; i >= 0; i--) { + panel_size *= dims[i]; + } + } + const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size; + auto eval_impl = self.inner(); + TensorSycl::internal::ScanLauncher_impl::scan_block( + eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, + inclusive, dev); + } +}; +} // namespace internal +} // namespace Eigen + +#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h new file mode 100644 index 00000000000..11ce0fb4e48 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -0,0 +1,415 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H +#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorShuffling + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor shuffling class. 
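+ *
+ * Output dimension i takes the size of input dimension shuffle[i]. A minimal
+ * usage sketch (the tensor names here are illustrative, not part of this
+ * file):
+ * \code
+ * Eigen::Tensor<float, 3> input(20, 30, 50);
+ * Eigen::array<int, 3> shuffle{{1, 2, 0}};
+ * Eigen::Tensor<float, 3> output = input.shuffle(shuffle);
+ * // output has dimensions (30, 50, 20)
+ * \endcode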
+ * + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorShufflingOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorShufflingOp type; +}; + +} // end namespace internal + +template +class TensorShufflingOp : public TensorBase > { + public: + typedef TensorBase > Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl) + : m_xpr(expr), m_shuffle(shfl) {} + + EIGEN_DEVICE_FUNC const Shuffle& shufflePermutation() const { return m_shuffle; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp) + + protected: + typename XprType::Nested m_xpr; + const Shuffle m_shuffle; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Self; + typedef TensorShufflingOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = true, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), m_impl(op.expression(), device) { + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const Shuffle& shuffle = op.shufflePermutation(); + m_is_identity = true; + for (int i = 0; i < NumDims; ++i) { + m_shuffle[i] = static_cast(shuffle[i]); + m_dimensions[i] = input_dims[shuffle[i]]; + m_inverseShuffle[shuffle[i]] = i; + if (m_is_identity && shuffle[i] != i) { + m_is_identity = false; + } + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_unshuffledInputStrides[0] = 1; + m_outputStrides[0] = 1; + + 
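// Column-major: dimension 0 is innermost, so strides grow from the left (inner) to the right (outer).
+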
for (int i = 1; i < NumDims; ++i) { + m_unshuffledInputStrides[i] = m_unshuffledInputStrides[i - 1] * input_dims[i - 1]; + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = + internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1)); + } + } else { + m_unshuffledInputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_unshuffledInputStrides[i] = m_unshuffledInputStrides[i + 1] * input_dims[i + 1]; + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = + internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1)); + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } + } + + template + struct PacketLoader { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType Run(const Self& self, Index index) { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = self.coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + }; + + template + struct PacketLoader { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType Run(const Self& self, Index index) { + if (self.m_is_identity) { + return self.m_impl.template packet(index); + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = self.coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + }; + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + return PacketLoader::PacketAccess>::Run(*this, index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const int inner_dim = Layout == static_cast(ColMajor) ? 0 : NumDims - 1; + + const size_t target_size = m_device.firstLevelCacheSize(); + const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim; + + // Shuffled inner dimensions leads to a random memory access, which is not + // captured by default cost model bytes loaded/stored. We add this cost + // explicitly. The number of cycles picked based on the benchmarks. + // TODO(ezhulenev): This number was picked based on a very questionable + // benchmarks, add benchmarks that are representative of real workloads. 
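+    // Uniform block sizing with an explicit per-coefficient compute cost when the inner dimension is
+    // shuffled; otherwise prefer skewed (inner-dimension contiguous) block sizing.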
+ using BlockRequirements = internal::TensorBlockResourceRequirements; + if (inner_dim_shuffled) { + return BlockRequirements::uniform(target_size).addCostPerCoeff({0, 0, NumDims * 28}); + } else { + return BlockRequirements::skewed(target_size); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool root_of_expr_ast = false) const { + eigen_assert(m_impl.data() != NULL); + + typedef internal::TensorBlockIO TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch, /*allow_strided_storage=*/root_of_expr_ast); + + typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides); + TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset())); + + TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(), block_storage.data()); + + typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle); + TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = m_is_identity + ? TensorOpCost::AddCost() + : NumDims * (2 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index + GetBlockOutputIndex(Index input_index, const DSizes& input_block_strides, + const DSizes& output_block_strides, + const DSizes, NumDims>& fast_input_block_strides) const { + Index output_index = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = input_index / fast_input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * output_block_strides[m_inverseShuffle[0]]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = input_index / fast_input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * output_block_strides[m_inverseShuffle[NumDims - 1]]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[0]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[NumDims - 1]; + } + } + + Dimensions m_dimensions; + bool m_is_identity; + array m_shuffle; + array m_inverseShuffle; // TODO(ezhulenev): Make it int type. 
+ array m_outputStrides; + array, NumDims> m_fastOutputStrides; + array m_inputStrides; + array m_unshuffledInputStrides; + + const Device EIGEN_DEVICE_REF m_device; + TensorEvaluator m_impl; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + + typedef TensorShufflingOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + static constexpr int Layout = TensorEvaluator::Layout; + + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = true, + RawAccess = false + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + internal::pstore(values, x); + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index + i) = values[i]; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) { + eigen_assert(this->m_impl.data() != NULL); + + typedef internal::TensorBlockIO TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + const Scalar* block_buffer = block.data(); + + // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen + // expression with coefficient and packet access as `src`. + void* mem = NULL; + if (block_buffer == NULL) { + mem = this->m_device.allocate(desc.size() * sizeof(Scalar)); + ScalarNoConst* buf = static_cast(mem); + + typedef internal::TensorBlockAssignment + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target(desc.dimensions(), internal::strides(desc.dimensions()), buf), + block.expr()); + + block_buffer = buf; + } + + // Read from block. + TensorBlockIOSrc src(internal::strides(desc.dimensions()), block_buffer); + + // Write to the output buffer. + typename TensorBlockIO::Dimensions output_strides(this->m_unshuffledInputStrides); + typename TensorBlockIO::Dimensions output_dimensions; + for (int i = 0; i < NumDims; ++i) { + output_dimensions[this->m_shuffle[i]] = desc.dimension(i); + } + TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(), this->srcCoeff(desc.offset())); + + // Reorder dimensions according to the shuffle. + typename TensorBlockIO::DimensionsMap dst_to_src_dim_map; + for (int i = 0; i < NumDims; ++i) { + dst_to_src_dim_map[i] = static_cast(this->m_inverseShuffle[i]); + } + TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); + + // Deallocate temporary buffer used for the block materialization. 
+ if (mem != NULL) this->m_device.deallocate(mem); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h new file mode 100644 index 00000000000..964797613ae --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -0,0 +1,144 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// Copyright (C) 2014-2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSORSTORAGE_H +#define EIGEN_CXX11_TENSOR_TENSORSTORAGE_H + +#ifdef EIGEN_TENSOR_STORAGE_CTOR_PLUGIN +#define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN EIGEN_TENSOR_STORAGE_CTOR_PLUGIN; +#else +#define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN +#endif + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \internal + * + * \class TensorStorage + * \ingroup CXX11_Tensor_Module + * + * \brief Stores the data of a tensor + * + * This class stores the data of fixed-size, dynamic-size or mixed tensors + * in a way as compact as possible. + * + * \sa Tensor + */ +template +class TensorStorage; + +// Pure fixed-size storage +template +class TensorStorage { + private: + static constexpr std::size_t Size = FixedDimensions::total_size; + + // Allocate an array of size at least one to prevent compiler warnings. + static constexpr std::size_t MinSize = max_n_1::size; + EIGEN_ALIGN_MAX T m_data[MinSize]; + + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStorage() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* data() { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T* data() const { return m_data; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const FixedDimensions dimensions() const { return FixedDimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex size() const { return Size; } +}; + +// pure dynamic +template +class TensorStorage, Options_> { + public: + typedef IndexType Index; + typedef DSizes Dimensions; + typedef TensorStorage, Options_> Self; + + EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() { + if (NumIndices_ == 0) { + m_data = internal::conditional_aligned_new_auto(1); + } + } + EIGEN_DEVICE_FUNC TensorStorage(Index size, const array& dimensions) + : m_data(internal::conditional_aligned_new_auto(size)), m_dimensions(dimensions) { + EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN + } + + template + EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) 
{ + m_data = internal::conditional_aligned_new_auto(internal::array_prod(m_dimensions)); + } + + EIGEN_DEVICE_FUNC TensorStorage(const Self& other) + : m_data(internal::conditional_aligned_new_auto( + internal::array_prod(other.m_dimensions))), + m_dimensions(other.m_dimensions) { + internal::smart_copy(other.m_data, other.m_data + internal::array_prod(other.m_dimensions), m_data); + } + EIGEN_DEVICE_FUNC Self& operator=(const Self& other) { + if (this != &other) { + Self tmp(other); + this->swap(tmp); + } + return *this; + } + + EIGEN_DEVICE_FUNC TensorStorage(Self&& other) : TensorStorage() { *this = std::move(other); } + + EIGEN_DEVICE_FUNC Self& operator=(Self&& other) { + numext::swap(m_data, other.m_data); + numext::swap(m_dimensions, other.m_dimensions); + return *this; + } + + EIGEN_DEVICE_FUNC ~TensorStorage() { + internal::conditional_aligned_delete_auto(m_data, + internal::array_prod(m_dimensions)); + } + EIGEN_DEVICE_FUNC void swap(Self& other) { + numext::swap(m_data, other.m_data); + numext::swap(m_dimensions, other.m_dimensions); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC void resize(Index size, const array& nbDimensions) { + const Index currentSz = internal::array_prod(m_dimensions); + if (size != currentSz) { + internal::conditional_aligned_delete_auto(m_data, currentSz); + if (size) + m_data = internal::conditional_aligned_new_auto(size); + else if (NumIndices_ == 0) { + m_data = internal::conditional_aligned_new_auto(1); + } else + m_data = 0; + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) + } + m_dimensions = nbDimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* data() { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T* data() const { return m_data; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } + + private: + T* m_data; + Dimensions m_dimensions; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h new file mode 100644 index 00000000000..2924abba062 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -0,0 +1,316 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorStriding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor striding class. 
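+ *
+ * Output dimension i has size ceil(input_dim[i] / strides[i]), keeping every
+ * strides[i]-th coefficient along dimension i. A minimal usage sketch (the
+ * tensor names here are illustrative, not part of this file):
+ * \code
+ * Eigen::Tensor<float, 2> input(6, 8);
+ * Eigen::array<int, 2> strides{{2, 2}};
+ * Eigen::Tensor<float, 2> output = input.stride(strides);
+ * // output has dimensions (3, 4)
+ * \endcode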
+ * + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorStridingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorStridingOp type; +}; + +} // end namespace internal + +template +class TensorStridingOp : public TensorBase > { + public: + typedef TensorBase > Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC const Strides& strides() const { return m_dims; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingOp) + + protected: + typename XprType::Nested m_xpr; + const Strides m_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorStridingOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = Eigen::numext::ceil(static_cast(m_dimensions[i]) / op.strides()[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + m_inputStrides[i - 1] *= op.strides()[i - 1]; + } + m_inputStrides[NumDims - 1] *= op.strides()[NumDims - 1]; + } else { // RowMajor + m_outputStrides[NumDims - 1] = 1; + 
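// Row-major: the last dimension is innermost, so strides grow from the right (inner) to the left (outer).
+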
m_inputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + m_inputStrides[i + 1] *= op.strides()[i + 1]; + } + m_inputStrides[0] *= op.strides()[0]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_impl.coeff(srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + PacketSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[0]; + inputIndices[1] += indices[1] * m_inputStrides[0]; + } else { // RowMajor + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[NumDims - 1]; + inputIndices[1] += indices[1] * m_inputStrides[NumDims - 1]; + } + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[PacketSize - 1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize - 1; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost() + TensorOpCost::MulCost() + + TensorOpCost::DivCost()) + + TensorOpCost::MulCost(); + if (vectorized) { + compute_cost *= 2; // packet() computes two indices + } + const int innerDim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : (NumDims - 1); + return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) + + // Computation is not vectorized per se, but it is done once per packet. 
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[0]; + } else { // RowMajor + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[NumDims - 1]; + } + return inputIndex; + } + + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorStridingOp XprType; + typedef TensorEvaluator Base; + // typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + // typedef DSizes Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = TensorEvaluator::PacketAccess, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < this->dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + PacketSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[0]; + inputIndices[1] += indices[1] * this->m_inputStrides[0]; + } else { // RowMajor + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[NumDims - 1]; + inputIndices[1] += indices[1] * this->m_inputStrides[NumDims - 1]; + } + if (inputIndices[1] - inputIndices[0] == PacketSize 
- 1) { + this->m_impl.template writePacket(inputIndices[0], x); + } else { + EIGEN_ALIGN_MAX Scalar values[PacketSize]; + internal::pstore(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize - 1]; + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize - 1; ++i) { + this->coeffRef(index + i) = values[i]; + } + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h new file mode 100644 index 00000000000..035d94e540f --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -0,0 +1,278 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gagan Goel +// Copyright (C) 2017 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRACE_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRACE_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorTrace + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor Trace class. + * + * + */ + +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions - array_size::value; + static constexpr int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorTraceOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorTraceOp type; +}; + +} // end namespace internal + +template +class TensorTraceOp : public TensorBase > { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTraceOp(const XprType& expr, const Dims& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dims& dims() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t& expression() const { + return m_xpr; + } + + protected: + typename XprType::Nested m_xpr; + const Dims m_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorTraceOp XprType; + static constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + static constexpr int NumReducedDims = internal::array_size::value; + static constexpr int NumOutputDims = NumInputDims - NumReducedDims; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type 
PacketReturnType; + static constexpr int PacketSize = internal::unpacket_traits::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_traceDim(1), m_device(device) { + EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumReducedDims >= 2) || ((NumReducedDims == 0) && (NumInputDims == 0)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + for (int i = 0; i < NumInputDims; ++i) { + m_reduced[i] = false; + } + + const Dims& op_dims = op.dims(); + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op_dims[i] >= 0); + eigen_assert(op_dims[i] < NumInputDims); + m_reduced[op_dims[i]] = true; + } + + // All the dimensions should be distinct to compute the trace + int num_distinct_reduce_dims = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + ++num_distinct_reduce_dims; + } + } + + EIGEN_ONLY_USED_FOR_DEBUG(num_distinct_reduce_dims); + eigen_assert(num_distinct_reduce_dims == NumReducedDims); + + // Compute the dimensions of the result. + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + int output_index = 0; + int reduced_index = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + m_reducedDims[reduced_index] = input_dims[i]; + if (reduced_index > 0) { + // All the trace dimensions must have the same size + eigen_assert(m_reducedDims[0] == m_reducedDims[reduced_index]); + } + ++reduced_index; + } else { + m_dimensions[output_index] = input_dims[i]; + ++output_index; + } + } + + if (NumReducedDims != 0) { + m_traceDim = m_reducedDims[0]; + } + + // Compute the output strides + if (NumOutputDims > 0) { + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_outputStrides.back() = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + } + + // Compute the input strides + if (NumInputDims > 0) { + array input_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i - 1] * input_dims[i - 1]; + } + } else { + input_strides.back() = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + } + + output_index = 0; + reduced_index = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + m_reducedStrides[reduced_index] = input_strides[i]; + ++reduced_index; + } else { + m_preservedStrides[output_index] = input_strides[i]; + ++output_index; + } + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + 
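// Eagerly evaluate the nested expression; trace coefficients themselves are computed on demand in coeff().
+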
m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + // Initialize the result + CoeffReturnType result = internal::cast(0); + Index index_stride = 0; + for (int i = 0; i < NumReducedDims; ++i) { + index_stride += m_reducedStrides[i]; + } + + // If trace is requested along all dimensions, starting index would be 0 + Index cur_index = 0; + if (NumOutputDims != 0) cur_index = firstInput(index); + for (Index i = 0; i < m_traceDim; ++i) { + result += m_impl.coeff(cur_index); + cur_index += index_stride; + } + + return result; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType result = internal::ploadt(values); + return result; + } + + protected: + // Given the output index, finds the first index in the input tensor used to compute the trace + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[0]; + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[NumOutputDims - 1]; + } + return startInput; + } + + Dimensions m_dimensions; + TensorEvaluator m_impl; + // Initialize the size of the trace dimension + Index m_traceDim; + const Device EIGEN_DEVICE_REF m_device; + array m_reduced; + array m_reducedDims; + array m_outputStrides; + array m_reducedStrides; + array m_preservedStrides; +}; + +} // End namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRACE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h new file mode 100644 index 00000000000..017b4ff6340 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -0,0 +1,231 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +template +class compute_tensor_flags { + enum { + is_dynamic_size_storage = 1, + + is_aligned = (((Options & DontAlign) == 0) && ( +#if EIGEN_MAX_STATIC_ALIGN_BYTES > 0 + (!is_dynamic_size_storage) +#else + 0 +#endif + | +#if EIGEN_MAX_ALIGN_BYTES > 0 + is_dynamic_size_storage +#else + 0 +#endif + )), + packet_access_bit = packet_traits::Vectorizable && is_aligned ? 
PacketAccessBit : 0 + }; + + public: + enum { ret = packet_access_bit }; +}; + +template +struct traits > { + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef IndexType_ Index; + static constexpr int NumDimensions = NumIndices_; + static constexpr int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags::ret | (is_const::value ? 0 : LvalueBit) + }; + template + struct MakePointer { + typedef T* Type; + }; + typedef typename MakePointer::Type PointerType; +}; + +template +struct traits > { + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef IndexType_ Index; + static constexpr int NumDimensions = array_size::value; + static constexpr int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags::ret | (is_const::value ? 0 : LvalueBit) + }; + template + struct MakePointer { + typedef T* Type; + }; + typedef typename MakePointer::Type PointerType; +}; + +template class MakePointer_> +struct traits > : public traits { + typedef traits BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + static constexpr int NumDimensions = BaseTraits::NumDimensions; + static constexpr int Layout = BaseTraits::Layout; + enum { Options = Options_, Flags = BaseTraits::Flags }; + template + struct MakePointer { + // Intermediate typedef to workaround MSVC issue. + typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; + typedef typename MakePointer::Type PointerType; +}; + +template +struct traits > : public traits { + typedef traits BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + static constexpr int NumDimensions = BaseTraits::NumDimensions; + static constexpr int Layout = BaseTraits::Layout; + enum { Options = BaseTraits::Options, Flags = BaseTraits::Flags }; + typedef typename BaseTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const Tensor EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> { + typedef const Tensor EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorFixedSize EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorFixedSize EIGEN_DEVICE_REF type; +}; + +template class MakePointer> +struct eval, Eigen::Dense> { + typedef const TensorMap EIGEN_DEVICE_REF type; +}; + +template class MakePointer> +struct eval, Eigen::Dense> { + typedef const TensorMap EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorRef EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorRef EIGEN_DEVICE_REF type; +}; + +// TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector. 
+template +struct nested { + typedef typename ref_selector::type type; +}; + +template +struct nested > { + typedef const Tensor EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef const Tensor EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef const TensorFixedSize EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef const TensorFixedSize EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef const TensorRef EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef const TensorRef EIGEN_DEVICE_REF type; +}; + +} // end namespace internal + +// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C, +// R, B), and convolve it with a set of filters, which can also be presented as +// a tensor (D, K, K, M), where M is the number of filters, K is the filter +// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For +// simplicity we assume that we always use square filters (which is usually the +// case in images), hence the two Ks in the tensor dimension. It also takes in +// a few additional parameters: +// Stride (S): The convolution stride is the offset between locations where we +// apply the filters. A larger stride means that the output will be +// spatially smaller. +// Padding (P): The padding we apply to the input tensor along the R and C +// dimensions. This is usually used to make sure that the spatial +// dimensions of the output matches our intention. +// +// Two types of padding are often used: +// SAME: The pad value is computed so that the output will have size +// R/S and C/S. +// VALID: no padding is carried out. +// When we do padding, the padded values at the padded locations are usually +// zero. +// +// The output dimensions for convolution, when given all the parameters above, +// are as follows: +// When Padding = SAME: the output size is (B, R', C', M), where +// R' = ceil(float(R) / float(S)) +// C' = ceil(float(C) / float(S)) +// where ceil is the ceiling function. The input tensor is padded with 0 as +// needed. The number of padded rows and columns are computed as: +// Pr = ((R' - 1) * S + K - R) / 2 +// Pc = ((C' - 1) * S + K - C) / 2 +// when the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2. +// This is where SAME comes from - the output has the same size as the input has. +// When Padding = VALID: the output size is computed as +// R' = ceil(float(R - K + 1) / float(S)) +// C' = ceil(float(C - K + 1) / float(S)) +// and the number of padded rows and columns are computed in the same way as in +// the SAME case. +// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0, +// Pc=0. +enum PaddingType { PADDING_VALID = 1, PADDING_SAME = 2 }; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h new file mode 100644 index 00000000000..99e51c57718 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -0,0 +1,229 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_UINT128_H +#define EIGEN_CXX11_TENSOR_TENSOR_UINT128_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +template +struct static_val { + static const uint64_t value = n; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() {} + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) { + EIGEN_UNUSED_VARIABLE(v); + eigen_assert(v == n); + } +}; + +template +struct TensorUInt128 { + HIGH high; + LOW low; + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(const TensorUInt128& other) + : high(other.high), low(other.low) { + EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128& operator=(const TensorUInt128& other) { + EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE); + high = other.high; + low = other.low; + return *this; + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE explicit TensorUInt128(const T& x) : high(0), low(x) { + eigen_assert( + (static_cast>(x) <= NumTraits::highest())); + eigen_assert(x >= 0); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(HIGH y, LOW x) : high(y), low(x) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const { return low; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LOW lower() const { return low; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HIGH upper() const { return high; } +}; + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator==(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + return (lhs.high == rhs.high) && (lhs.low == rhs.low); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator!=(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + return (lhs.high != rhs.high) || (lhs.low != rhs.low); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator>=(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + if (lhs.high != rhs.high) { + return lhs.high > rhs.high; + } + return lhs.low >= rhs.low; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator<(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + if (lhs.high != rhs.high) { + return lhs.high < rhs.high; + } + return lhs.low < rhs.low; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128 operator+(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + TensorUInt128 result(lhs.high + rhs.high, lhs.low + rhs.low); + if (result.low < rhs.low) { + result.high += 1; + } + return result; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128 operator-(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + TensorUInt128 result(lhs.high - rhs.high, lhs.low - rhs.low); + if (result.low > lhs.low) { + result.high -= 1; + } + return result; +} + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorUInt128 operator*( + const TensorUInt128& lhs, const TensorUInt128& rhs) { + // Split each 128-bit integer into 4 32-bit integers, and then do the + // multiplications by hand as follow: + // lhs a b c d + // rhs e f g h + // ----------- + // ah 
bh ch dh + // bg cg dg + // cf df + // de + // The result is stored in 2 64bit integers, high and low. + + const uint64_t LOW = 0x00000000FFFFFFFFLL; + const uint64_t HIGH = 0xFFFFFFFF00000000LL; + + uint64_t d = lhs.low & LOW; + uint64_t c = (lhs.low & HIGH) >> 32LL; + uint64_t b = lhs.high & LOW; + uint64_t a = (lhs.high & HIGH) >> 32LL; + + uint64_t h = rhs.low & LOW; + uint64_t g = (rhs.low & HIGH) >> 32LL; + uint64_t f = rhs.high & LOW; + uint64_t e = (rhs.high & HIGH) >> 32LL; + + // Compute the low 32 bits of low + uint64_t acc = d * h; + uint64_t low = acc & LOW; + // Compute the high 32 bits of low. Add a carry every time we wrap around + acc >>= 32LL; + uint64_t carry = 0; + uint64_t acc2 = acc + c * h; + if (acc2 < acc) { + carry++; + } + acc = acc2 + d * g; + if (acc < acc2) { + carry++; + } + low |= (acc << 32LL); + + // Carry forward the high bits of acc to initiate the computation of the + // low 32 bits of high + acc2 = (acc >> 32LL) | (carry << 32LL); + carry = 0; + + acc = acc2 + b * h; + if (acc < acc2) { + carry++; + } + acc2 = acc + c * g; + if (acc2 < acc) { + carry++; + } + acc = acc2 + d * f; + if (acc < acc2) { + carry++; + } + uint64_t high = acc & LOW; + + // Start to compute the high 32 bits of high. + acc2 = (acc >> 32LL) | (carry << 32LL); + + acc = acc2 + a * h; + acc2 = acc + b * g; + acc = acc2 + c * f; + acc2 = acc + d * e; + high |= (acc2 << 32LL); + + return TensorUInt128(high, low); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorUInt128 operator/( + const TensorUInt128& lhs, const TensorUInt128& rhs) { + if (rhs == TensorUInt128, static_val<1>>(1)) { + return TensorUInt128(lhs.high, lhs.low); + } else if (lhs < rhs) { + return TensorUInt128(0); + } else { + // calculate the biggest power of 2 times rhs that's less than or equal to lhs + TensorUInt128 power2(1); + TensorUInt128 d(rhs); + TensorUInt128 tmp(lhs - d); + while (lhs >= d) { + tmp = tmp - d; + d = d + d; + power2 = power2 + power2; + } + + tmp = TensorUInt128(lhs.high, lhs.low); + TensorUInt128 result(0); + while (power2 != TensorUInt128, static_val<0>>(0)) { + if (tmp >= d) { + tmp = tmp - d; + result = result + power2; + } + // Shift right + power2 = TensorUInt128(power2.high >> 1, (power2.low >> 1) | (power2.high << 63)); + d = TensorUInt128(d.high >> 1, (d.low >> 1) | (d.high << 63)); + } + + return result; + } +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_UINT128_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h new file mode 100644 index 00000000000..2f0c1354167 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -0,0 +1,619 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorVolumePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for processing of volumetric data. + * This assumes that the input has a least 4 dimensions ordered as follows: + * - channels + * - planes + * - rows + * - columns + * - (optional) additional dimensions such as time or batch size. 
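+ * For example (ColMajor), extracting 2x2x2 patches from an input of
+ * dimensions (depth, planes, rows, cols, batch) produces a tensor of
+ * dimensions (depth, 2, 2, 2, number_of_patches, batch).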
+ * Calling the volume patch code with patch_planes, patch_rows, and patch_cols + * is equivalent to calling the regular patch extraction code with parameters + * d, patch_planes, patch_rows, patch_cols, and 1 for all the additional + * dimensions. + */ +namespace internal { + +template +struct traits > : public traits { + typedef std::remove_const_t Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions + 1; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorVolumePatchOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorVolumePatchOp type; +}; + +} // end namespace internal + +template +class TensorVolumePatchOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp( + const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, DenseIndex in_plane_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, DenseIndex plane_inflate_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, PaddingType padding_type, Scalar padding_value) + : m_xpr(expr), + m_patch_planes(patch_planes), + m_patch_rows(patch_rows), + m_patch_cols(patch_cols), + m_plane_strides(plane_strides), + m_row_strides(row_strides), + m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), + m_in_row_strides(in_row_strides), + m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), + m_row_inflate_strides(row_inflate_strides), + m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), + m_padding_top_z(0), + m_padding_bottom_z(0), + m_padding_top(0), + m_padding_bottom(0), + m_padding_left(0), + m_padding_right(0), + m_padding_type(padding_type), + m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp( + const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, DenseIndex in_plane_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, DenseIndex plane_inflate_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, DenseIndex padding_top_z, + DenseIndex padding_bottom_z, DenseIndex padding_top, DenseIndex padding_bottom, DenseIndex padding_left, + DenseIndex padding_right, Scalar padding_value) + : m_xpr(expr), + m_patch_planes(patch_planes), + m_patch_rows(patch_rows), + m_patch_cols(patch_cols), + m_plane_strides(plane_strides), + m_row_strides(row_strides), + m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), + m_in_row_strides(in_row_strides), + m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), + 
m_row_inflate_strides(row_inflate_strides), + m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), + m_padding_top_z(padding_top_z), + m_padding_bottom_z(padding_bottom_z), + m_padding_top(padding_top), + m_padding_bottom(padding_bottom), + m_padding_left(padding_left), + m_padding_right(padding_right), + m_padding_type(PADDING_VALID), + m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC DenseIndex patch_planes() const { return m_patch_planes; } + EIGEN_DEVICE_FUNC DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC DenseIndex plane_strides() const { return m_plane_strides; } + EIGEN_DEVICE_FUNC DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC DenseIndex in_plane_strides() const { return m_in_plane_strides; } + EIGEN_DEVICE_FUNC DenseIndex in_row_strides() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC DenseIndex in_col_strides() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC DenseIndex plane_inflate_strides() const { return m_plane_inflate_strides; } + EIGEN_DEVICE_FUNC DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC bool padding_explicit() const { return m_padding_explicit; } + EIGEN_DEVICE_FUNC DenseIndex padding_top_z() const { return m_padding_top_z; } + EIGEN_DEVICE_FUNC DenseIndex padding_bottom_z() const { return m_padding_bottom_z; } + EIGEN_DEVICE_FUNC DenseIndex padding_top() const { return m_padding_top; } + EIGEN_DEVICE_FUNC DenseIndex padding_bottom() const { return m_padding_bottom; } + EIGEN_DEVICE_FUNC DenseIndex padding_left() const { return m_padding_left; } + EIGEN_DEVICE_FUNC DenseIndex padding_right() const { return m_padding_right; } + EIGEN_DEVICE_FUNC PaddingType padding_type() const { return m_padding_type; } + EIGEN_DEVICE_FUNC Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_planes; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_plane_strides; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const DenseIndex m_in_plane_strides; + const DenseIndex m_in_row_strides; + const DenseIndex m_in_col_strides; + const DenseIndex m_plane_inflate_strides; + const DenseIndex m_row_inflate_strides; + const DenseIndex m_col_inflate_strides; + const bool m_padding_explicit; + const DenseIndex m_padding_top_z; + const DenseIndex m_padding_bottom_z; + const DenseIndex m_padding_top; + const DenseIndex m_padding_bottom; + const DenseIndex m_padding_left; + const DenseIndex m_padding_right; + const PaddingType m_padding_type; + const Scalar m_padding_value; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorVolumePatchOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + static constexpr int NumDims = NumInputDims + 1; + typedef DSizes Dimensions; + typedef std::remove_const_t Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef 
StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { + EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); + + m_paddingValue = op.padding_value(); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + // Cache a few variables. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputDepth = input_dims[0]; + m_inputPlanes = input_dims[1]; + m_inputRows = input_dims[2]; + m_inputCols = input_dims[3]; + } else { + m_inputDepth = input_dims[NumInputDims - 1]; + m_inputPlanes = input_dims[NumInputDims - 2]; + m_inputRows = input_dims[NumInputDims - 3]; + m_inputCols = input_dims[NumInputDims - 4]; + } + + m_plane_strides = op.plane_strides(); + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // Input strides and effective input/patch size + m_in_plane_strides = op.in_plane_strides(); + m_in_row_strides = op.in_row_strides(); + m_in_col_strides = op.in_col_strides(); + m_plane_inflate_strides = op.plane_inflate_strides(); + m_row_inflate_strides = op.row_inflate_strides(); + m_col_inflate_strides = op.col_inflate_strides(); + + // The "effective" spatial size after inflating data with zeros. 
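+    // For example, 4 input rows with a row inflate stride of 2 span
+    // (4 - 1) * 2 + 1 = 7 effective rows, and a 3-row patch sampled with an
+    // in-row stride of 2 covers 3 + (3 - 1) * (2 - 1) = 5 effective rows.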
+ m_input_planes_eff = (m_inputPlanes - 1) * m_plane_inflate_strides + 1; + m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; + m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; + m_patch_planes_eff = op.patch_planes() + (op.patch_planes() - 1) * (m_in_plane_strides - 1); + m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); + m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); + + if (op.padding_explicit()) { + m_outputPlanes = + numext::ceil((m_input_planes_eff + op.padding_top_z() + op.padding_bottom_z() - m_patch_planes_eff + 1.f) / + static_cast(m_plane_strides)); + m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / + static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / + static_cast(m_col_strides)); + m_planePaddingTop = op.padding_top_z(); + m_rowPaddingTop = op.padding_top(); + m_colPaddingLeft = op.padding_left(); + } else { + // Computing padding from the type + switch (op.padding_type()) { + case PADDING_VALID: + m_outputPlanes = + numext::ceil((m_input_planes_eff - m_patch_planes_eff + 1.f) / static_cast(m_plane_strides)); + m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); + m_planePaddingTop = 0; + m_rowPaddingTop = 0; + m_colPaddingLeft = 0; + break; + case PADDING_SAME: { + m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast(m_plane_strides)); + m_outputRows = numext::ceil(m_input_rows_eff / static_cast(m_row_strides)); + m_outputCols = numext::ceil(m_input_cols_eff / static_cast(m_col_strides)); + const Index dz = (m_outputPlanes - 1) * m_plane_strides + m_patch_planes_eff - m_input_planes_eff; + const Index dy = (m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff; + const Index dx = (m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff; + m_planePaddingTop = dz / 2; + m_rowPaddingTop = dy / 2; + m_colPaddingLeft = dx / 2; + break; + } + default: { + eigen_assert(false && "unexpected padding"); + return; + } + } + } + eigen_assert(m_outputRows > 0); + eigen_assert(m_outputCols > 0); + eigen_assert(m_outputPlanes > 0); + + // Dimensions for result of extraction. + if (static_cast(Layout) == static_cast(ColMajor)) { + // ColMajor + // 0: depth + // 1: patch_planes + // 2: patch_rows + // 3: patch_cols + // 4: number of patches + // 5 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_planes(); + m_dimensions[2] = op.patch_rows(); + m_dimensions[3] = op.patch_cols(); + m_dimensions[4] = m_outputPlanes * m_outputRows * m_outputCols; + for (int i = 5; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i - 1]; + } + } else { + // RowMajor + // NumDims-1: depth + // NumDims-2: patch_planes + // NumDims-3: patch_rows + // NumDims-4: patch_cols + // NumDims-5: number of patches + // NumDims-6 and beyond: anything else (such as batch). 
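+      // For example, a RowMajor input of dimensions (batch, cols, rows,
+      // planes, depth) with 2x2x2 patches yields an output of dimensions
+      // (batch, number_of_patches, 2, 2, 2, depth).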
+ m_dimensions[NumDims - 1] = input_dims[NumInputDims - 1]; + m_dimensions[NumDims - 2] = op.patch_planes(); + m_dimensions[NumDims - 3] = op.patch_rows(); + m_dimensions[NumDims - 4] = op.patch_cols(); + m_dimensions[NumDims - 5] = m_outputPlanes * m_outputRows * m_outputCols; + for (int i = NumDims - 6; i >= 0; --i) { + m_dimensions[i] = input_dims[i]; + } + } + + // Strides for the output tensor. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_rowStride = m_dimensions[1]; + m_colStride = m_dimensions[2] * m_rowStride; + m_patchStride = m_colStride * m_dimensions[3] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[4]; + } else { + m_rowStride = m_dimensions[NumDims - 2]; + m_colStride = m_dimensions[NumDims - 3] * m_rowStride; + m_patchStride = m_colStride * m_dimensions[NumDims - 4] * m_dimensions[NumDims - 1]; + m_otherStride = m_patchStride * m_dimensions[NumDims - 5]; + } + + // Strides for navigating through the input tensor. + m_planeInputStride = m_inputDepth; + m_rowInputStride = m_inputDepth * m_inputPlanes; + m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes; + m_otherInputStride = m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes; + + m_outputPlanesRows = m_outputPlanes * m_outputRows; + + // Fast representations of different variables. + m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); + + m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); + m_fastColStride = internal::TensorIntDivisor(m_colStride); + m_fastRowStride = internal::TensorIntDivisor(m_rowStride); + m_fastInputRowStride = internal::TensorIntDivisor(m_row_inflate_strides); + m_fastInputColStride = internal::TensorIntDivisor(m_col_inflate_strides); + m_fastInputPlaneStride = internal::TensorIntDivisor(m_plane_inflate_strides); + m_fastInputColsEff = internal::TensorIntDivisor(m_input_cols_eff); + m_fastOutputPlanes = internal::TensorIntDivisor(m_outputPlanes); + m_fastOutputPlanesRows = internal::TensorIntDivisor(m_outputPlanesRows); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[0]); + } else { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims - 1]); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType /*data*/, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + + // Spatial offset within the patch. This has to be translated into 3D + // coordinates within the patch. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; + + // Batch, etc. + const Index otherIndex = (NumDims == 5) ? 0 : index / m_fastOtherStride; + const Index patch3DIndex = (NumDims == 5) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + // Calculate column index in the input original tensor. 
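+    // For example, with outputPlanes = 4 and outputRows = 5 (so
+    // outputPlanesRows = 20), patch3DIndex = 27 gives colIndex = 27 / 20 = 1;
+    // the remainder 27 - 1 * 20 = 7 is then split into rowIndex = 7 / 4 = 1
+    // and planeIndex = 27 - 4 * (1 * 5 + 1) = 3 below.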
+ const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = + (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate row index in the original input tensor. + const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + const Index rowOffset = (patchOffset - colOffset * m_colStride) / m_fastRowStride; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = + (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate plane index in the original input tensor. + const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex)); + const Index planeOffset = patchOffset - colOffset * m_colStride - rowOffset * m_rowStride; + const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop; + const Index origInputPlane = + (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0); + if (inputPlane < 0 || inputPlane >= m_input_planes_eff || + ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + + const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + + origInputPlane * m_planeInputStride + otherIndex * m_otherInputStride; + + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 || + m_in_plane_strides != 1 || m_plane_inflate_strides != 1) { + return packetWithPossibleZero(index); + } + + const Index indices[2] = {index, index + PacketSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 5) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, + (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; + + const Index patch3DIndex = + (NumDims == 5) ? 
patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch3DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + // Calculate col indices in the original input tensor. + const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft, + colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputCols[0] != inputCols[1]) { + return packetWithPossibleZero(index); + } + + const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + const Index rowOffsets[2] = {(patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride, + (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + // Calculate col indices in the original input tensor. + const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop, + rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputRows[0] != inputRows[1]) { + return packetWithPossibleZero(index); + } + + const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex)); + const Index planeOffsets[2] = {patchOffsets[0] - colOffsets[0] * m_colStride - rowOffsets[0] * m_rowStride, + patchOffsets[1] - colOffsets[1] * m_colStride - rowOffsets[1] * m_rowStride}; + eigen_assert(planeOffsets[0] <= planeOffsets[1]); + const Index inputPlanes[2] = {planeIndex * m_plane_strides + planeOffsets[0] - m_planePaddingTop, + planeIndex * m_plane_strides + planeOffsets[1] - m_planePaddingTop}; + + if (inputPlanes[1] < 0 || inputPlanes[0] >= m_inputPlanes) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) { + // no padding + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 
0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + + m_planeInputStride * inputPlanes[0] + otherIndex * m_otherInputStride; + return m_impl.template packet(inputIndex); + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = + 10 * TensorOpCost::DivCost() + 21 * TensorOpCost::MulCost() + 8 * TensorOpCost::AddCost(); + return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + const TensorEvaluator& impl() const { return m_impl; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planePaddingTop() const { return m_planePaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputPlanes() const { return m_outputPlanes; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userPlaneStride() const { return m_plane_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInPlaneStride() const { return m_in_plane_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planeInflateStride() const { return m_plane_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + + // Parameters passed to the constructor. + Index m_plane_strides; + Index m_row_strides; + Index m_col_strides; + + Index m_outputPlanes; + Index m_outputRows; + Index m_outputCols; + + Index m_planePaddingTop; + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + Index m_in_plane_strides; + Index m_in_row_strides; + Index m_in_col_strides; + + Index m_plane_inflate_strides; + Index m_row_inflate_strides; + Index m_col_inflate_strides; + + // Cached input size. + Index m_inputDepth; + Index m_inputPlanes; + Index m_inputRows; + Index m_inputCols; + + // Other cached variables. + Index m_outputPlanesRows; + + // Effective input/patch post-inflation size. + Index m_input_planes_eff; + Index m_input_rows_eff; + Index m_input_cols_eff; + Index m_patch_planes_eff; + Index m_patch_rows_eff; + Index m_patch_cols_eff; + + // Strides for the output tensor. 
+ Index m_otherStride; + Index m_patchStride; + Index m_rowStride; + Index m_colStride; + + // Strides for the input tensor. + Index m_planeInputStride; + Index m_rowInputStride; + Index m_colInputStride; + Index m_otherInputStride; + + internal::TensorIntDivisor m_fastOtherStride; + internal::TensorIntDivisor m_fastPatchStride; + internal::TensorIntDivisor m_fastColStride; + internal::TensorIntDivisor m_fastRowStride; + internal::TensorIntDivisor m_fastInputPlaneStride; + internal::TensorIntDivisor m_fastInputRowStride; + internal::TensorIntDivisor m_fastInputColStride; + internal::TensorIntDivisor m_fastInputColsEff; + internal::TensorIntDivisor m_fastOutputPlanesRows; + internal::TensorIntDivisor m_fastOutputPlanes; + internal::TensorIntDivisor m_fastOutputDepth; + + Scalar m_paddingValue; + + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h new file mode 100644 index 00000000000..51c0ad6658a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h @@ -0,0 +1,296 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +class DynamicSGroup { + public: + inline explicit DynamicSGroup() : m_numIndices(1), m_elements(), m_generators(), m_globalFlags(0) { + m_elements.push_back(ge(Generator(0, 0, 0))); + } + inline DynamicSGroup(const DynamicSGroup& o) + : m_numIndices(o.m_numIndices), + m_elements(o.m_elements), + m_generators(o.m_generators), + m_globalFlags(o.m_globalFlags) {} + inline DynamicSGroup(DynamicSGroup&& o) + : m_numIndices(o.m_numIndices), m_elements(), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { + std::swap(m_elements, o.m_elements); + } + inline DynamicSGroup& operator=(const DynamicSGroup& o) { + m_numIndices = o.m_numIndices; + m_elements = o.m_elements; + m_generators = o.m_generators; + m_globalFlags = o.m_globalFlags; + return *this; + } + inline DynamicSGroup& operator=(DynamicSGroup&& o) { + m_numIndices = o.m_numIndices; + std::swap(m_elements, o.m_elements); + m_generators = o.m_generators; + m_globalFlags = o.m_globalFlags; + return *this; + } + + void add(int one, int two, int flags = 0); + + template + inline void add(Gen_) { + add(Gen_::One, Gen_::Two, Gen_::Flags); + } + inline void addSymmetry(int one, int two) { add(one, two, 0); } + inline void addAntiSymmetry(int one, int two) { add(one, two, NegationFlag); } + inline void addHermiticity(int one, int two) { add(one, two, ConjugationFlag); } + inline void addAntiHermiticity(int one, int two) { add(one, two, NegationFlag | ConjugationFlag); } + + template + inline RV apply(const std::array& idx, RV initial, Args&&... 
args) const { + eigen_assert(N >= m_numIndices && + "Can only apply symmetry group to objects that have at least the required amount of indices."); + for (std::size_t i = 0; i < size(); i++) + initial = Op::run(h_permute(i, idx, typename internal::gen_numeric_list::type()), m_elements[i].flags, + initial, std::forward(args)...); + return initial; + } + + template + inline RV apply(const std::vector& idx, RV initial, Args&&... args) const { + eigen_assert(idx.size() >= m_numIndices && + "Can only apply symmetry group to objects that have at least the required amount of indices."); + for (std::size_t i = 0; i < size(); i++) + initial = Op::run(h_permute(i, idx), m_elements[i].flags, initial, std::forward(args)...); + return initial; + } + + inline int globalFlags() const { return m_globalFlags; } + inline std::size_t size() const { return m_elements.size(); } + + template + inline internal::tensor_symmetry_value_setter operator()(Tensor_& tensor, + typename Tensor_::Index firstIndex, + IndexTypes... otherIndices) const { + static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, + "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + } + + template + inline internal::tensor_symmetry_value_setter operator()( + Tensor_& tensor, std::array const& indices) const { + return internal::tensor_symmetry_value_setter(tensor, *this, indices); + } + + private: + struct GroupElement { + std::vector representation; + int flags; + bool isId() const { + for (std::size_t i = 0; i < representation.size(); i++) + if (i != (size_t)representation[i]) return false; + return true; + } + }; + struct Generator { + int one; + int two; + int flags; + constexpr inline Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {} + }; + + std::size_t m_numIndices; + std::vector m_elements; + std::vector m_generators; + int m_globalFlags; + + template + inline std::array h_permute(std::size_t which, const std::array& idx, + internal::numeric_list) const { + return std::array{{idx[n >= m_numIndices ? 
n : m_elements[which].representation[n]]...}}; + } + + template + inline std::vector h_permute(std::size_t which, std::vector idx) const { + std::vector result; + result.reserve(idx.size()); + for (auto k : m_elements[which].representation) result.push_back(idx[k]); + for (std::size_t i = m_numIndices; i < idx.size(); i++) result.push_back(idx[i]); + return result; + } + + inline GroupElement ge(Generator const& g) const { + GroupElement result; + result.representation.reserve(m_numIndices); + result.flags = g.flags; + for (std::size_t k = 0; k < m_numIndices; k++) { + if (k == (std::size_t)g.one) + result.representation.push_back(g.two); + else if (k == (std::size_t)g.two) + result.representation.push_back(g.one); + else + result.representation.push_back(int(k)); + } + return result; + } + + GroupElement mul(GroupElement, GroupElement) const; + inline GroupElement mul(Generator g1, GroupElement g2) const { return mul(ge(g1), g2); } + + inline GroupElement mul(GroupElement g1, Generator g2) const { return mul(g1, ge(g2)); } + + inline GroupElement mul(Generator g1, Generator g2) const { return mul(ge(g1), ge(g2)); } + + inline int findElement(GroupElement e) const { + for (auto ee : m_elements) { + if (ee.representation == e.representation) return ee.flags ^ e.flags; + } + return -1; + } + + void updateGlobalFlags(int flagDiffOfSameGenerator); +}; + +// dynamic symmetry group that auto-adds the template parameters in the constructor +template +class DynamicSGroupFromTemplateArgs : public DynamicSGroup { + public: + inline DynamicSGroupFromTemplateArgs() : DynamicSGroup() { add_all(internal::type_list()); } + inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs const& other) : DynamicSGroup(other) {} + inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs&& other) : DynamicSGroup(other) {} + inline DynamicSGroupFromTemplateArgs& operator=(const DynamicSGroupFromTemplateArgs& o) { + DynamicSGroup::operator=(o); + return *this; + } + inline DynamicSGroupFromTemplateArgs& operator=(DynamicSGroupFromTemplateArgs&& o) { + DynamicSGroup::operator=(o); + return *this; + } + + private: + template + inline void add_all(internal::type_list) { + add(Gen1()); + add_all(internal::type_list()); + } + + inline void add_all(internal::type_list<>) {} +}; + +inline DynamicSGroup::GroupElement DynamicSGroup::mul(GroupElement g1, GroupElement g2) const { + eigen_internal_assert(g1.representation.size() == m_numIndices); + eigen_internal_assert(g2.representation.size() == m_numIndices); + + GroupElement result; + result.representation.reserve(m_numIndices); + for (std::size_t i = 0; i < m_numIndices; i++) { + int v = g2.representation[g1.representation[i]]; + eigen_assert(v >= 0); + result.representation.push_back(v); + } + result.flags = g1.flags ^ g2.flags; + return result; +} + +inline void DynamicSGroup::add(int one, int two, int flags) { + eigen_assert(one >= 0); + eigen_assert(two >= 0); + eigen_assert(one != two); + + if ((std::size_t)one >= m_numIndices || (std::size_t)two >= m_numIndices) { + std::size_t newNumIndices = (one > two) ? 
one : two + 1; + for (auto& gelem : m_elements) { + gelem.representation.reserve(newNumIndices); + for (std::size_t i = m_numIndices; i < newNumIndices; i++) gelem.representation.push_back(i); + } + m_numIndices = newNumIndices; + } + + Generator g{one, two, flags}; + GroupElement e = ge(g); + + /* special case for first generator */ + if (m_elements.size() == 1) { + while (!e.isId()) { + m_elements.push_back(e); + e = mul(e, g); + } + + if (e.flags > 0) updateGlobalFlags(e.flags); + + // only add in case we didn't have identity + if (m_elements.size() > 1) m_generators.push_back(g); + return; + } + + int p = findElement(e); + if (p >= 0) { + updateGlobalFlags(p); + return; + } + + std::size_t coset_order = m_elements.size(); + m_elements.push_back(e); + for (std::size_t i = 1; i < coset_order; i++) m_elements.push_back(mul(m_elements[i], e)); + m_generators.push_back(g); + + std::size_t coset_rep = coset_order; + do { + for (auto g : m_generators) { + e = mul(m_elements[coset_rep], g); + p = findElement(e); + if (p < 0) { + // element not yet in group + m_elements.push_back(e); + for (std::size_t i = 1; i < coset_order; i++) m_elements.push_back(mul(m_elements[i], e)); + } else if (p > 0) { + updateGlobalFlags(p); + } + } + coset_rep += coset_order; + } while (coset_rep < m_elements.size()); +} + +inline void DynamicSGroup::updateGlobalFlags(int flagDiffOfSameGenerator) { + switch (flagDiffOfSameGenerator) { + case 0: + default: + // nothing happened + break; + case NegationFlag: + // every element is it's own negative => whole tensor is zero + m_globalFlags |= GlobalZeroFlag; + break; + case ConjugationFlag: + // every element is it's own conjugate => whole tensor is real + m_globalFlags |= GlobalRealFlag; + break; + case (NegationFlag | ConjugationFlag): + // every element is it's own negative conjugate => whole tensor is imaginary + m_globalFlags |= GlobalImagFlag; + break; + /* NOTE: + * since GlobalZeroFlag == GlobalRealFlag | GlobalImagFlag, if one generator + * causes the tensor to be real and the next one to be imaginary, this will + * trivially give the correct result + */ + } +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/InternalHeaderCheck.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/InternalHeaderCheck.h new file mode 100644 index 00000000000..b1b2e14c6e6 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/InternalHeaderCheck.h @@ -0,0 +1,4 @@ +#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE_H +#error \ + "Please include unsupported/Eigen/CXX11/TensorSymmetry instead of including headers inside the src directory directly." +#endif diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h new file mode 100644 index 00000000000..3f9bb51e716 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h @@ -0,0 +1,223 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +template +struct tensor_static_symgroup_permutate; + +template +struct tensor_static_symgroup_permutate> { + constexpr static std::size_t N = sizeof...(nn); + + template + constexpr static inline std::array run(const std::array& indices) { + return {{indices[nn]...}}; + } +}; + +template +struct tensor_static_symgroup_element { + typedef indices_ indices; + constexpr static int flags = flags_; +}; + +template +struct tensor_static_symgroup_element_ctor { + typedef tensor_static_symgroup_element::type, + Gen::Flags> + type; +}; + +template +struct tensor_static_symgroup_identity_ctor { + typedef tensor_static_symgroup_element::type, 0> type; +}; + +template +struct tensor_static_symgroup_multiply_helper { + template + constexpr static inline numeric_list::value...> helper(numeric_list) { + return numeric_list::value...>(); + } +}; + +template +struct tensor_static_symgroup_multiply { + private: + typedef typename A::indices iia; + typedef typename B::indices iib; + constexpr static int ffa = A::flags; + constexpr static int ffb = B::flags; + + public: + static_assert(iia::count == iib::count, "Cannot multiply symmetry elements with different number of indices."); + + typedef tensor_static_symgroup_element::helper(iia())), + ffa ^ ffb> + type; +}; + +template +struct tensor_static_symgroup_equality { + typedef typename A::indices iia; + typedef typename B::indices iib; + constexpr static int ffa = A::flags; + constexpr static int ffb = B::flags; + static_assert(iia::count == iib::count, "Cannot compare symmetry elements with different number of indices."); + + constexpr static bool value = is_same::value; + + private: + /* this should be zero if they are identical, or else the tensor + * will be forced to be pure real, pure imaginary or even pure zero + */ + constexpr static int flags_cmp_ = ffa ^ ffb; + + /* either they are not equal, then we don't care whether the flags + * match, or they are equal, and then we have to check + */ + constexpr static bool is_zero = value && flags_cmp_ == NegationFlag; + constexpr static bool is_real = value && flags_cmp_ == ConjugationFlag; + constexpr static bool is_imag = value && flags_cmp_ == (NegationFlag | ConjugationFlag); + + public: + constexpr static int global_flags = + (is_real ? GlobalRealFlag : 0) | (is_imag ? GlobalImagFlag : 0) | (is_zero ? 
GlobalZeroFlag : 0); +}; + +template +struct tensor_static_symgroup { + typedef StaticSGroup type; + constexpr static std::size_t size = type::static_size; +}; + +template +constexpr static inline std::array tensor_static_symgroup_index_permute(std::array idx, + internal::numeric_list, + internal::numeric_list) { + return {{idx[ii]..., idx[jj]...}}; +} + +template +static inline std::vector tensor_static_symgroup_index_permute(std::vector idx, + internal::numeric_list) { + std::vector result{{idx[ii]...}}; + std::size_t target_size = idx.size(); + for (std::size_t i = result.size(); i < target_size; i++) result.push_back(idx[i]); + return result; +} + +template +struct tensor_static_symgroup_do_apply; + +template +struct tensor_static_symgroup_do_apply> { + template + static inline RV run(const std::array& idx, RV initial, Args&&... args) { + static_assert(NumIndices >= SGNumIndices, + "Can only apply symmetry group to objects that have at least the required amount of indices."); + typedef typename internal::gen_numeric_list::type remaining_indices; + initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices(), remaining_indices()), + first::flags, initial, std::forward(args)...); + return tensor_static_symgroup_do_apply>::template run( + idx, initial, args...); + } + + template + static inline RV run(const std::vector& idx, RV initial, Args&&... args) { + eigen_assert(idx.size() >= SGNumIndices && + "Can only apply symmetry group to objects that have at least the required amount of indices."); + initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial, + std::forward(args)...); + return tensor_static_symgroup_do_apply>::template run( + idx, initial, args...); + } +}; + +template +struct tensor_static_symgroup_do_apply> { + template + static inline RV run(const std::array&, RV initial, Args&&...) { + // do nothing + return initial; + } + + template + static inline RV run(const std::vector&, RV initial, Args&&...) { + // do nothing + return initial; + } +}; + +} // end namespace internal + +template +class StaticSGroup { + constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices::value; + typedef internal::group_theory::enumerate_group_elements< + internal::tensor_static_symgroup_multiply, internal::tensor_static_symgroup_equality, + typename internal::tensor_static_symgroup_identity_ctor::type, + internal::type_list::type...>> + group_elements; + typedef typename group_elements::type ge; + + public: + constexpr inline StaticSGroup() {} + constexpr inline StaticSGroup(const StaticSGroup&) {} + constexpr inline StaticSGroup(StaticSGroup&&) {} + + template + static inline RV apply(const std::array& idx, RV initial, Args&&... args) { + return internal::tensor_static_symgroup_do_apply::template run(idx, initial, args...); + } + + template + static inline RV apply(const std::vector& idx, RV initial, Args&&... args) { + eigen_assert(idx.size() == NumIndices); + return internal::tensor_static_symgroup_do_apply::template run(idx, initial, args...); + } + + constexpr static std::size_t static_size = ge::count; + + constexpr static inline std::size_t size() { return ge::count; } + constexpr static inline int globalFlags() { return group_elements::global_flags; } + + template + inline internal::tensor_symmetry_value_setter> operator()( + Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... 
otherIndices) const { + static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, + "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + } + + template + inline internal::tensor_symmetry_value_setter> operator()( + Tensor_& tensor, std::array const& indices) const { + return internal::tensor_symmetry_value_setter>(tensor, *this, indices); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h new file mode 100644 index 00000000000..2d3ff466f37 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h @@ -0,0 +1,335 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +enum { NegationFlag = 0x01, ConjugationFlag = 0x02 }; + +enum { GlobalRealFlag = 0x01, GlobalImagFlag = 0x02, GlobalZeroFlag = 0x03 }; + +namespace internal { + +template +struct tensor_symmetry_pre_analysis; +template +struct tensor_static_symgroup; +template +struct tensor_static_symgroup_if; +template +struct tensor_symmetry_calculate_flags; +template +struct tensor_symmetry_assign_value; +template +struct tensor_symmetry_num_indices; + +} // end namespace internal + +template +struct Symmetry { + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = 0; +}; + +template +struct AntiSymmetry { + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = NegationFlag; +}; + +template +struct Hermiticity { + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = ConjugationFlag; +}; + +template +struct AntiHermiticity { + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = ConjugationFlag | NegationFlag; +}; + +/** \class DynamicSGroup + * \ingroup TensorSymmetry_Module + * + * \brief Dynamic symmetry group + * + * The %DynamicSGroup class represents a symmetry group that need not be known at + * compile time. 
+
+/** \class DynamicSGroup
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Dynamic symmetry group
+ *
+ * The %DynamicSGroup class represents a symmetry group that need not be known at
+ * compile time. It is useful if one wants to support arbitrary run-time definable
+ * symmetries for tensors, but it is also instantiated if a symmetry group defined
+ * at compile time would be either too large for the compiler to reasonably
+ * generate (using templates to calculate this at compile time is very inefficient)
+ * or if the compiler could generate the group but unrolling the loop for setting
+ * coefficients would no longer make sense.
+ */
+class DynamicSGroup;
+
+/** \internal
+ *
+ * \class DynamicSGroupFromTemplateArgs
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Dynamic symmetry group, initialized from template arguments
+ *
+ * This class is a child class of DynamicSGroup. It uses the template arguments
+ * specified to initialize itself.
+ */
+template
+class DynamicSGroupFromTemplateArgs;
+
+/** \class StaticSGroup
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Static symmetry group
+ *
+ * This class represents a symmetry group that is known and resolved completely
+ * at compile time. Ideally, no run-time penalty is incurred compared to the
+ * manual unrolling of the symmetry.
+ *
+ * CAUTION:
+ *
+ * Do not use this class directly for large symmetry groups. The compiler
+ * may run into a limit, or segfault, or at the very least will take a very,
+ * very, very long time to compile the code. Use the SGroup class instead
+ * if you want a static group. That class contains logic that will
+ * automatically select the DynamicSGroup class instead if the symmetry
+ * group becomes too large. (In that case, unrolling may not even be
+ * beneficial.)
+ */
+template
+class StaticSGroup;
+
+/** \class SGroup
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Symmetry group, initialized from template arguments
+ *
+ * This class represents a symmetry group whose generators are already
+ * known at compile time. It may or may not be resolved at compile time,
+ * depending on the estimated size of the group.
+ *
+ * \sa StaticSGroup
+ * \sa DynamicSGroup
+ */
+template
+class SGroup : public internal::tensor_symmetry_pre_analysis::value,
+                                                              Gen...>::root_type {
+ public:
+  constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices::value;
+  typedef typename internal::tensor_symmetry_pre_analysis::root_type Base;
+
+  // make standard constructors + assignment operators public
+  inline SGroup() : Base() {}
+  inline SGroup(const SGroup& other) : Base(other) {}
+  inline SGroup(SGroup&& other) : Base(other) {}
+  inline SGroup& operator=(const SGroup& other) {
+    Base::operator=(other);
+    return *this;
+  }
+  inline SGroup& operator=(SGroup&& other) {
+    Base::operator=(other);
+    return *this;
+  }
+
+  // all else is defined in the base class
+};
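Because the base class is chosen by tensor_symmetry_pre_analysis, the same SGroup spelling can resolve to either backend. A sketch of what that means in practice (illustrative assumptions; the exact thresholds are defined further down in this file):

    // One generator: the group has 2 elements, so SGroup's base is the
    // fully unrolled StaticSGroup; NumIndices is deduced from the generators.
    Eigen::SGroup<Eigen::Symmetry<0, 1>> sym2;
    static_assert(decltype(sym2)::NumIndices == 2);

    // Five generators exceed the 4-generator limit discussed below, so the
    // group is enumerated at run time through the DynamicSGroup backend.
    Eigen::SGroup<Eigen::Symmetry<0, 1>, Eigen::Symmetry<2, 3>, Eigen::Symmetry<4, 5>,
                  Eigen::Symmetry<6, 7>, Eigen::Symmetry<8, 9>>
        sym10;
    static_assert(decltype(sym10)::NumIndices == 10);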
+
+namespace internal {
+
+template
+struct tensor_symmetry_num_indices {
+  constexpr static std::size_t value = 1;
+};
+
+template
+struct tensor_symmetry_num_indices, Sym...> {
+ private:
+  constexpr static std::size_t One = static_cast(One_);
+  constexpr static std::size_t Two = static_cast(Two_);
+  constexpr static std::size_t Three = tensor_symmetry_num_indices::value;
+
+  // don't use std::max, since it's not constexpr until C++14...
+  constexpr static std::size_t maxOneTwoPlusOne = ((One > Two) ? One : Two) + 1;
+
+ public:
+  constexpr static std::size_t value = (maxOneTwoPlusOne > Three) ? maxOneTwoPlusOne : Three;
+};
+
+template
+struct tensor_symmetry_num_indices, Sym...>
+    : public tensor_symmetry_num_indices, Sym...> {};
+template
+struct tensor_symmetry_num_indices, Sym...>
+    : public tensor_symmetry_num_indices, Sym...> {};
+template
+struct tensor_symmetry_num_indices, Sym...>
+    : public tensor_symmetry_num_indices, Sym...> {};
+
+/** \internal
+ *
+ * \class tensor_symmetry_pre_analysis
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Pre-select whether to use a static or dynamic symmetry group
+ *
+ * When a symmetry group could in principle be determined at compile time,
+ * this template implements the logic that decides whether to actually do so
+ * or to defer the enumeration to runtime.
+ *
+ * The logic is as follows:
+ * <dl>
+ * <dt><b>No generators (trivial symmetry):</b></dt>
+ * <dd>Use a trivial static group. Ideally, this has no performance impact
+ *     compared to not using symmetry at all. In practice, this might not
+ *     be the case.</dd>
+ * <dt><b>More than 4 generators:</b></dt>
+ * <dd>Calculate the group at run time, it is likely far too large for the
+ *     compiler to be able to properly generate it in a realistic time.</dd>
+ * <dt><b>Up to and including 4 generators:</b></dt>
+ * <dd>Actually enumerate all group elements, but then check how many there
+ *     are. If there are more than 16, it is unlikely that unrolling the
+ *     loop (as is done in the static compile-time case) is sensible, so
+ *     use a dynamic group instead. If there are at most 16 elements, actually
+ *     use that static group. Note that the largest group with 4 generators
+ *     still compiles with reasonable resources.</dd>
+ * </dl>
+ *
+ * Note: Example compile time performance with g++-4.6 on an Intel Core i5-3470
+ *       with 16 GiB RAM (all generators non-redundant and the subgroups don't
+ *       factorize):
+ *
+ *          # Generators    -O0 -ggdb            -O2
+ *          ------------------------------------------------------
+ *          1               0.5 s  /   250 MiB   0.45s /   230 MiB
+ *          2               0.5 s  /   260 MiB   0.5 s /   250 MiB
+ *          3               0.65s  /   310 MiB   0.62s /   310 MiB
+ *          4               2.2 s  /   860 MiB   1.7 s /   770 MiB
+ *          5               130 s  / 13000 MiB   120 s / 11000 MiB
+ *
+ * It is clear that everything is still very efficient up to 4 generators, then
+ * the memory and CPU requirements become unreasonable. Thus we only instantiate
+ * the template group theory logic if the number of generators supplied is 4 or
+ * lower; otherwise the enumeration is forced to happen at run time, where the
+ * algorithm is reasonably fast.
+ */
+template
+struct tensor_symmetry_pre_analysis {
+  typedef StaticSGroup<> root_type;
+};
+
+template
+struct tensor_symmetry_pre_analysis {
+  constexpr static std::size_t max_static_generators = 4;
+  constexpr static std::size_t max_static_elements = 16;
+  typedef tensor_static_symgroup_if<(sizeof...(Gens_) + 1 <= max_static_generators), NumIndices, Gen_, Gens_...> helper;
+  constexpr static std::size_t possible_size = helper::size;
+
+  typedef std::conditional_t<possible_size == 0 || possible_size >= max_static_elements,
+                             DynamicSGroupFromTemplateArgs<Gen_, Gens_...>, typename helper::type>
+      root_type;
+};
+
+template
+struct tensor_static_symgroup_if {
+  constexpr static std::size_t size = 0;
+  typedef void type;
+};
+
+template
+struct tensor_static_symgroup_if : tensor_static_symgroup {};
+
+template
+struct tensor_symmetry_assign_value {
+  typedef typename Tensor_::Index Index;
+  typedef typename Tensor_::Scalar Scalar;
+  constexpr static std::size_t NumIndices = Tensor_::NumIndices;
+
+  static inline int run(const std::array& transformed_indices, int transformation_flags, int dummy,
+                        Tensor_& tensor, const Scalar& value_) {
+    Scalar value(value_);
+    if (transformation_flags & ConjugationFlag) value = numext::conj(value);
+    if (transformation_flags & NegationFlag) value = -value;
+    tensor.coeffRef(transformed_indices) = value;
+    return dummy;
+  }
+};
+
+template
+struct tensor_symmetry_calculate_flags {
+  typedef typename Tensor_::Index Index;
+  constexpr static std::size_t NumIndices = Tensor_::NumIndices;
+
+  static inline int run(const std::array& transformed_indices, int transform_flags,
+                        int current_flags, const std::array& orig_indices) {
+    if (transformed_indices == orig_indices) {
+      if ((transform_flags & (ConjugationFlag | NegationFlag)) == (ConjugationFlag | NegationFlag))
+        return current_flags | GlobalImagFlag;  // anti-hermitian diagonal
+      else if (transform_flags & ConjugationFlag)
+        return current_flags | GlobalRealFlag;  // hermitian diagonal
+      else if (transform_flags & NegationFlag)
+        return current_flags | GlobalZeroFlag;  // anti-symmetric diagonal
+    }
+    return current_flags;
+  }
+};
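A worked illustration of the diagonal-flag logic above (editorial sketch, not patch content; assumes the vendored headers are on the include path). For a Hermitian index pair, the transposition maps a diagonal coefficient to itself while carrying ConjugationFlag, so calculate_flags reports GlobalRealFlag and the diagonal must be real:

    #include <complex>
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <unsupported/Eigen/CXX11/TensorSymmetry>

    int main() {
      Eigen::Tensor<std::complex<double>, 2> h(3, 3);
      h.setZero();
      Eigen::SGroup<Eigen::Hermiticity<0, 1>> herm;

      // Off-diagonal: writes h(0,1) = v and h(1,0) = conj(v).
      herm(h, 0, 1) = std::complex<double>(1.0, 2.0);

      // Diagonal: with EIGEN_TENSOR_SYMMETRY_CHECK_VALUES defined, the
      // value setter below asserts that the assigned value is real.
      herm(h, 1, 1) = std::complex<double>(2.0, 0.0);
    }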
+
+template
+class tensor_symmetry_value_setter {
+ public:
+  typedef typename Tensor_::Index Index;
+  typedef typename Tensor_::Scalar Scalar;
+  constexpr static std::size_t NumIndices = Tensor_::NumIndices;
+
+  inline tensor_symmetry_value_setter(Tensor_& tensor, Symmetry_ const& symmetry,
+                                      std::array const& indices)
+      : m_tensor(tensor), m_symmetry(symmetry), m_indices(indices) {}
+
+  inline tensor_symmetry_value_setter& operator=(Scalar const& value) {
+    doAssign(value);
+    return *this;
+  }
+
+ private:
+  Tensor_& m_tensor;
+  Symmetry_ m_symmetry;
+  std::array m_indices;
+
+  inline void doAssign(Scalar const& value) {
+#ifdef EIGEN_TENSOR_SYMMETRY_CHECK_VALUES
+    int value_flags = m_symmetry.template apply, int>(
+        m_indices, m_symmetry.globalFlags(), m_indices);
+    if (value_flags & GlobalRealFlag) eigen_assert(numext::imag(value) == 0);
+    if (value_flags & GlobalImagFlag) eigen_assert(numext::real(value) == 0);
+#endif
+    m_symmetry.template apply, int>(m_indices, 0, m_tensor, value);
+  }
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
new file mode 100644
index 00000000000..aa16f3cd448
--- /dev/null
+++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
@@ -0,0 +1,492 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
+#define EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+namespace group_theory {
+
+/** \internal
+ * \file CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
+ * This file contains C++ templates that implement group theory algorithms.
+ *
+ * The algorithms allow for a compile-time analysis of finite groups.
+ *
+ * Currently only Dimino's algorithm is implemented, which returns a list
+ * of all elements in a group given a set of (possibly redundant) generators.
+ * (One could also do that with the so-called orbital algorithm, but that
+ * is much more expensive and usually has no advantages.)
+ */
+
+/**********************************************************************
+ *                  "Ok kid, here is where it gets complicated."
+ *                         - Amelia Pond in the "Doctor Who" episode
+ *                           "The Big Bang"
+ *
+ * Dimino's algorithm
+ * ==================
+ *
+ * The following is Dimino's algorithm in sequential form:
+ *
+ * Input: identity element, list of generators, equality check,
+ *        multiplication operation
+ * Output: list of group elements
+ *
+ * 1. add identity element
+ * 2. remove identities from list of generators
+ * 3. add all powers of first generator that aren't the
+ *    identity element
+ * 4. go through all remaining generators:
+ *    a. if generator is already in the list of elements
+ *       -> do nothing
+ *    b. otherwise
+ *       i.   remember current # of elements
+ *            (i.e. the size of the current subgroup)
+ *       ii.  add all current elements (which includes
+ *            the identity) each multiplied from right
+ *            with the current generator to the group
+ *       iii. add all remaining cosets that are generated
+ *            by products of the new generator with itself
+ *            and all other generators seen so far
+ *
+ * In functional form, this is implemented as a long set of recursive
+ * templates that have a complicated relationship.
+ *
+ * The main interface for Dimino's algorithm is the template
+ * enumerate_group_elements. All lists are implemented as variadic
+ * type_list and numeric_list
+ * templates.
+ *
+ * 'Calling' templates is usually done via typedefs.
+ *
+ * This implementation is an extended version of the basic algorithm.
+ * The extension is that each group element has a set of flags
+ * associated with it. Multiplication of two group elements
+ * with each other results in a group element whose flags are the
+ * XOR of the flags of the previous elements. Each time the algorithm
+ * notices that a group element it just calculated is already in the
+ * list of current elements, the flags of both will be compared, and
+ * any difference is added to the so-called 'global flags' of the group.
+ *
+ * The rationale behind this extension is that this allows not only
+ * for the description of symmetries between tensor indices, but
+ * also allows for the description of hermiticity, antisymmetry and
+ * antihermiticity. Negation and conjugation are each a specific bit
+ * in the flags value and if two different ways to reach a group
+ * element lead to two different flags, this poses a constraint on
+ * the allowed values of the resulting tensor. For example, if a
+ * group element is reached both with and without the conjugation
+ * flag, it is clear that the resulting tensor has to be real.
+ *
+ * Note that this flag mechanism is quite generic and may have other
+ * uses beyond tensor properties.
+ *
+ * IMPORTANT:
+ * This algorithm assumes the group to be finite. If you try to
+ * run it with a group that's infinite, the algorithm will only
+ * terminate once you hit a compiler limit (max template depth).
+ * Also note that trying to use this implementation to create a
+ * very large group will probably either make you hit the same
+ * limit, cause the compiler to segfault or at the very least
+ * take a *really* long time (hours, days, weeks - sic!) to
+ * compile. It is not recommended to plug in more than 4
+ * generators, unless they are independent of each other.
+ */
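For readers who find the template rendering hard to follow, here is a compact run-time transcription of the sequential pseudocode above (an editorial sketch in plain C++, not part of the patched file; it enumerates permutation groups and omits the flag tracking that Eigen's version adds):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    using Perm = std::vector<int>;  // p[i] is the image of index i

    // Composition convention: compose(a, b)[i] = a[b[i]], i.e. apply b first.
    static Perm compose(const Perm& a, const Perm& b) {
      Perm r(a.size());
      for (std::size_t i = 0; i < a.size(); ++i) r[i] = a[b[i]];
      return r;
    }

    static bool contains(const std::vector<Perm>& elems, const Perm& p) {
      return std::find(elems.begin(), elems.end(), p) != elems.end();
    }

    // Run-time Dimino enumeration, following steps 1-4 above.
    std::vector<Perm> dimino(std::size_t n, std::vector<Perm> gens) {
      Perm id(n);
      for (std::size_t i = 0; i < n; ++i) id[i] = static_cast<int>(i);

      std::vector<Perm> elements{id};                                     // step 1
      gens.erase(std::remove(gens.begin(), gens.end(), id), gens.end());  // step 2
      if (gens.empty()) return elements;

      // step 3: all powers of the first generator other than the identity
      for (Perm g = gens[0]; g != id; g = compose(g, gens[0])) elements.push_back(g);

      // step 4: fold in the remaining generators
      for (std::size_t i = 1; i < gens.size(); ++i) {
        if (contains(elements, gens[i])) continue;          // 4a
        const std::size_t subgroup_size = elements.size();  // 4b-i
        // 4b-ii: append the coset (current subgroup) * (new generator)
        for (std::size_t e = 0; e < subgroup_size; ++e)
          elements.push_back(compose(elements[e], gens[i]));
        // 4b-iii: close under products of coset representatives and generators
        for (std::size_t rep = subgroup_size; rep < elements.size(); rep += subgroup_size) {
          for (std::size_t j = 0; j <= i; ++j) {
            Perm candidate = compose(elements[rep], gens[j]);
            if (!contains(elements, candidate)) {
              for (std::size_t e = 0; e < subgroup_size; ++e)
                elements.push_back(compose(elements[e], candidate));
            }
          }
        }
      }
      return elements;
    }

    // Example: dimino(3, {{1, 0, 2}, {0, 2, 1}}) returns the 6 elements of S3.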
+
+/** \internal
+ *
+ * \class strip_identities
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Cleanse a list of group elements of the identity element
+ *
+ * This template is used to make a first pass through all initial
+ * generators of Dimino's algorithm and remove the identity
+ * elements.
+ *
+ * \sa enumerate_group_elements
+ */
+template