diff --git a/CMakeLists.txt b/CMakeLists.txt index 9448fc87bfa..90bd3b32f28 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,7 +145,8 @@ if(WITH_DOCS) endif() if(WITH_WPICAL) - find_package(Ceres CONFIG REQUIRED) + # find_package(Ceres CONFIG REQUIRED) + find_package (SuiteSparse 5.12 NO_MODULE) endif() find_package(LIBSSH CONFIG 0.7.1) diff --git a/build.gradle b/build.gradle index 2db70836038..e038893ce01 100644 --- a/build.gradle +++ b/build.gradle @@ -29,8 +29,8 @@ wpilibVersioning.releaseMode = project.hasProperty('releaseMode') allprojects { repositories { maven { - url = 'https://frcmaven.wpi.edu/artifactory/ex-mvn' - } + url = 'https://frcmaven.wpi.edu/artifactory/ex-mvn' + } } if (project.hasProperty('releaseMode')) { wpilibRepositories.addAllReleaseRepositories(it) diff --git a/shared/gtsam.gradle b/shared/gtsam.gradle new file mode 100644 index 00000000000..b4d582b9de2 --- /dev/null +++ b/shared/gtsam.gradle @@ -0,0 +1,13 @@ +nativeUtils { + nativeDependencyContainer { + gtsam(getNativeDependencyTypeClass('WPIStaticMavenDependency')) { + groupId = "edu.wpi.first.thirdparty.frc2025.gtsam" + artifactId = "gtsam-cpp" + headerClassifier = "headers" + sourceClassifier = "sources" + ext = "zip" + version = '4.3-2' + targetPlatforms.addAll(nativeUtils.wpi.platforms.desktopPlatforms) + } + } +} diff --git a/upstream_utils/eigen.py b/upstream_utils/eigen.py index 875c6dfbbae..0974b1b6b56 100755 --- a/upstream_utils/eigen.py +++ b/upstream_utils/eigen.py @@ -49,6 +49,7 @@ def eigen_inclusions(dp, f): modules = [ "Cholesky", "Core", + "Dense", "Eigenvalues", "Geometry", "Householder", @@ -86,7 +87,7 @@ def unsupported_inclusions(dp, f): return False # Include the MatrixFunctions module - return "MatrixFunctions" in abspath + return "MatrixFunctions" in abspath or "SpecialFunctions" in abspath or "Tensor" in abspath or abspath.endswith("SpecialFunctionsFunctors.h") def copy_upstream_src(wpilib_root): diff --git a/wpical/CMakeLists.txt b/wpical/CMakeLists.txt index f07d52755ac..b797e3a9e02 100644 --- a/wpical/CMakeLists.txt +++ b/wpical/CMakeLists.txt @@ -4,6 +4,9 @@ include(CompileWarnings) include(GenResources) include(LinkMacOSGUI) include(AddTest) +include(FetchContent) + +find_package (SuiteSparse 5.12 NO_MODULE) configure_file(src/main/generate/WPILibVersion.cpp.in WPILibVersion.cpp) generate_resources(src/main/native/resources generated/main/cpp WPIcal wpical wpical_resources_src) @@ -25,6 +28,26 @@ elseif(APPLE) set_source_files_properties(${APP_ICON_MACOSX} PROPERTIES MACOSX_PACKAGE_LOCATION "Resources") endif() +set(GTSAM_COMPILE_OPTIONS_PRIVATE_DEBUG CACHE INTERNAL FORCE) +set(GTSAM_COMPILE_OPTIONS_PRIVATE_RELWITHDEBINFO CACHE INTERNAL FORCE) +set(GTSAM_ENABLE_BOOST_SERIALIZATION OFF) +set(GTSAM_WITH_TBB OFF) +set(GTSAM_USE_BOOST_FEATURES OFF) +set(GTSAM_BUILD_PYTHON OFF) +set(GTSAM_BUILD_EXAMPLES_ALWAYS OFF) +set(GTSAM_BUILD_TESTS OFF) +set(BUILD_SHARED_LIBS OFF) +set(GTSAM_FORCE_SHARED_LIB OFF) +set(GTSAM_FORCE_STATIC_LIB ON) + +include(FetchContent) +fetchcontent_declare( + gtsam + GIT_REPOSITORY https://github.com/mcm001/gtsam + GIT_TAG c24ca7951f1db4cfe4e23f368b2eda503de72176 +) +fetchcontent_makeavailable(gtsam) + add_executable( wpical ${wpical_src} @@ -38,7 +61,9 @@ wpilib_target_warnings(wpical) target_include_directories( wpical PRIVATE + /usr/local/include/ src/main/native/include + src/main/native/thirdparty/mrcal/generated src/main/native/thirdparty/libdogleg/include src/main/native/thirdparty/mrcal/include src/main/native/thirdparty/mrcal_java/include @@ -82,14 +107,16 @@ endif() target_compile_options(wpical PRIVATE ${compile_flags}) find_package(OpenCV REQUIRED) -find_package(Ceres CONFIG REQUIRED) target_link_libraries( wpical + gtsam apriltag ${OpenCV_LIBS} wpigui wpiutil - Ceres::ceres + wpimath + SuiteSparse::CHOLMOD + # Ceres::ceres ) if(WIN32) @@ -119,11 +146,13 @@ if(WITH_TESTS) ) target_link_libraries( wpical_test + gtsam googletest apriltag ${OpenCV_LIBS} wpigui wpiutil - Ceres::ceres + SuiteSparse::CHOLMOD + # Ceres::ceres ) endif() diff --git a/wpical/build.gradle b/wpical/build.gradle index eea250c9036..14ac1e47ada 100644 --- a/wpical/build.gradle +++ b/wpical/build.gradle @@ -31,6 +31,7 @@ def wpilibVersionFileInput = file("src/main/generate/WPILibVersion.cpp.in") def wpilibVersionFileOutput = file("$buildDir/generated/main/cpp/WPILibVersion.cpp") apply from: "${rootDir}/shared/ceres.gradle" +apply from: "${rootDir}/shared/gtsam.gradle" apply from: "${rootDir}/shared/opencv.gradle" task generateCppVersion() { @@ -156,12 +157,14 @@ model { it.buildable = false return } + lib project: ':wpimath', library: 'wpimath', linkage: 'static' lib project: ':wpiutil', library: 'wpiutil', linkage: 'static' lib project: ':wpigui', library: 'wpigui', linkage: 'static' lib project: ':thirdparty:imgui_suite', library: 'imguiSuite', linkage: 'static' lib project: ':apriltag', library: 'apriltag', linkage: 'static' - nativeUtils.useRequiredLibrary(it, 'ceres') nativeUtils.useRequiredLibrary(it, 'opencv_static') + nativeUtils.useRequiredLibrary(it, 'gtsam') + nativeUtils.useRequiredLibrary(it, 'ceres') it.cppCompiler.define 'GLOG_USE_GLOG_EXPORT' if (it.targetPlatform.operatingSystem.isWindows()) { it.cppCompiler.define('GLOG_DEPRECATED', '__declspec(deprecated)') @@ -175,6 +178,9 @@ model { } else if (it.targetPlatform.operatingSystem.isMacOsX()) { it.linker.args << '-framework' << 'Metal' << '-framework' << 'MetalKit' << '-framework' << 'Cocoa' << '-framework' << 'IOKit' << '-framework' << 'CoreFoundation' << '-framework' << 'CoreVideo' << '-framework' << 'QuartzCore' << '-framework' << 'Accelerate' } else { + it.cppCompiler.args.add("-Wno-deprecated-copy") + it.cppCompiler.args.add("-Wno-redundant-move") + it.linker.args << '-lX11' << "-lgfortran" if (it.targetPlatform.name.startsWith('linuxarm')) { it.linker.args << '-lGL' @@ -211,10 +217,13 @@ model { lib project: ':wpigui', library: 'wpigui', linkage: 'static' lib project: ':thirdparty:imgui_suite', library: 'imguiSuite', linkage: 'static' lib project: ':apriltag', library: 'apriltag', linkage: 'static' + lib project: ':wpimath', library: 'wpimath', linkage: 'static' nativeUtils.useRequiredLibrary(it, 'ceres') nativeUtils.useRequiredLibrary(it, 'opencv_static') + nativeUtils.useRequiredLibrary(it, 'gtsam') it.cppCompiler.define('PROJECT_ROOT_PATH', testResources) it.cppCompiler.define 'GLOG_USE_GLOG_EXPORT' + if (it.targetPlatform.name == nativeUtils.wpi.platforms.windowsarm64) { // https://developercommunity.visualstudio.com/t/-imp-std-init-once-complete-unresolved-external-sy/1684365#T-N10041864 it.linker.args << '/alternatename:__imp___std_init_once_complete=__imp_InitOnceComplete' @@ -240,6 +249,9 @@ model { if (it.targetPlatform.name.startsWith('linuxarm')) { it.linker.args << '-lGL' } + + it.cppCompiler.args.add("-Wno-deprecated-copy") + it.cppCompiler.args.add("-Wno-redundant-move") } } } diff --git a/wpical/src/main/native/cpp/cameracalibration.cpp b/wpical/src/main/native/cpp/cameracalibration.cpp index 270de38af92..b1ab23e1dee 100644 --- a/wpical/src/main/native/cpp/cameracalibration.cpp +++ b/wpical/src/main/native/cpp/cameracalibration.cpp @@ -312,7 +312,11 @@ int cameracalibration::calibrate(const std::string& input_video, cv::Size boardSize(board_width - 1, board_height - 1); cv::Size imagerSize(imagerWidthPixels, imagerHeightPixels); + int frameIdx = 0; while (video_capture.grab()) { + std::cout << "Frame index " << frameIdx << std::endl; + frameIdx++; + cv::Mat frame; video_capture.retrieve(frame); diff --git a/wpical/src/main/native/cpp/fieldcalibration.cpp b/wpical/src/main/native/cpp/fieldcalibration.cpp index 1f3ff6970a3..6b924dd2d76 100644 --- a/wpical/src/main/native/cpp/fieldcalibration.cpp +++ b/wpical/src/main/native/cpp/fieldcalibration.cpp @@ -16,7 +16,8 @@ #include #include -#include +#include +#include #include #include #include @@ -26,70 +27,9 @@ #include #include "apriltag.h" +#include "gtsam_meme/wpical_gtsam.h" #include "tag36h11.h" - -struct Pose { - Eigen::Vector3d p; - Eigen::Quaterniond q; - EIGEN_MAKE_ALIGNED_OPERATOR_NEW -}; - -struct Constraint { - int id_begin; - int id_end; - Pose t_begin_end; - EIGEN_MAKE_ALIGNED_OPERATOR_NEW -}; - -class PoseGraphError { - public: - explicit PoseGraphError(Pose t_ab_observed) - : m_t_ab_observed(std::move(t_ab_observed)) {} - - template - bool operator()(const T* const p_a_ptr, const T* const q_a_ptr, - const T* const p_b_ptr, const T* const q_b_ptr, - T* residuals_ptr) const { - // Tag A - Eigen::Map> p_a(p_a_ptr); - Eigen::Map> q_a(q_a_ptr); - - // Tag B - Eigen::Map> p_b(p_b_ptr); - Eigen::Map> q_b(q_b_ptr); - - // Rotation between tag A to tag B - Eigen::Quaternion q_a_inverse = q_a.conjugate(); - Eigen::Quaternion q_ab_estimated = q_a_inverse * q_b; - - // Displacement between tag A and tag B in tag A's frame - Eigen::Matrix p_ab_estimated = q_a_inverse * (p_b - p_a); - - // Error between the orientations - Eigen::Quaternion delta_q = - m_t_ab_observed.q.template cast() * q_ab_estimated.conjugate(); - - // Residuals - Eigen::Map> residuals(residuals_ptr); - residuals.template block<3, 1>(0, 0) = - p_ab_estimated - m_t_ab_observed.p.template cast(); - residuals.template block<3, 1>(3, 0) = T(2.0) * delta_q.vec(); - - return true; - } - - static ceres::CostFunction* Create(const Pose& t_ab_observed) { - return new ceres::AutoDiffCostFunction( - new PoseGraphError(t_ab_observed)); - } - - EIGEN_MAKE_ALIGNED_OPERATOR_NEW - - private: - const Pose m_t_ab_observed; -}; - -const double tagSizeMeters = 0.1651; +#include "cameracalibration.h" inline cameracalibration::CameraModel load_camera_model(std::string path) { Eigen::Matrix camera_matrix; @@ -146,8 +86,7 @@ inline cameracalibration::CameraModel load_camera_model(std::string path) { } } - cameracalibration::CameraModel camera_model{ - camera_matrix, camera_distortion, json_data["avg_reprojection_error"]}; + cameracalibration::CameraModel camera_model{camera_matrix, camera_distortion, -1}; return camera_model; } @@ -172,86 +111,10 @@ inline cameracalibration::CameraModel load_camera_model(wpi::json json_data) { } } - cameracalibration::CameraModel camera_model{ - camera_matrix, camera_distortion, json_data["avg_reprojection_error"]}; + cameracalibration::CameraModel camera_model{camera_matrix, camera_distortion}; return camera_model; } -inline std::map load_ideal_map(std::string path) { - std::ifstream file(path); - wpi::json json_data = wpi::json::parse(file); - std::map ideal_map; - - for (const auto& element : json_data["tags"]) { - ideal_map[element["ID"]] = element; - } - - return ideal_map; -} - -Eigen::Matrix get_tag_transform( - std::map& ideal_map, int tag_id) { - Eigen::Matrix transform = - Eigen::Matrix::Identity(); - - transform.block<3, 3>(0, 0) = - Eigen::Quaternion( - ideal_map[tag_id]["pose"]["rotation"]["quaternion"]["W"], - ideal_map[tag_id]["pose"]["rotation"]["quaternion"]["X"], - ideal_map[tag_id]["pose"]["rotation"]["quaternion"]["Y"], - ideal_map[tag_id]["pose"]["rotation"]["quaternion"]["Z"]) - .toRotationMatrix(); - - transform(0, 3) = ideal_map[tag_id]["pose"]["translation"]["x"]; - transform(1, 3) = ideal_map[tag_id]["pose"]["translation"]["y"]; - transform(2, 3) = ideal_map[tag_id]["pose"]["translation"]["z"]; - - return transform; -} - -inline Eigen::Matrix estimate_tag_pose( - apriltag_detection_t* tag_detection, - const Eigen::Matrix& camera_matrix, - const Eigen::Matrix& camera_distortion, double tag_size) { - cv::Mat camera_matrix_cv; - cv::Mat camera_distortion_cv; - - cv::eigen2cv(camera_matrix, camera_matrix_cv); - cv::eigen2cv(camera_distortion, camera_distortion_cv); - - std::vector points_2d = { - cv::Point2f(tag_detection->p[0][0], tag_detection->p[0][1]), - cv::Point2f(tag_detection->p[1][0], tag_detection->p[1][1]), - cv::Point2f(tag_detection->p[2][0], tag_detection->p[2][1]), - cv::Point2f(tag_detection->p[3][0], tag_detection->p[3][1])}; - - std::vector points_3d_box_base = { - cv::Point3f(-tag_size / 2.0, tag_size / 2.0, 0.0), - cv::Point3f(tag_size / 2.0, tag_size / 2.0, 0.0), - cv::Point3f(tag_size / 2.0, -tag_size / 2.0, 0.0), - cv::Point3f(-tag_size / 2.0, -tag_size / 2.0, 0.0)}; - - std::vector r_vec; - std::vector t_vec; - - cv::solvePnP(points_3d_box_base, points_2d, camera_matrix_cv, - camera_distortion_cv, r_vec, t_vec, false, cv::SOLVEPNP_SQPNP); - - cv::Mat r_mat; - cv::Rodrigues(r_vec, r_mat); - - Eigen::Matrix camera_to_tag{ - {r_mat.at(0, 0), r_mat.at(0, 1), r_mat.at(0, 2), - t_vec.at(0)}, - {r_mat.at(1, 0), r_mat.at(1, 1), r_mat.at(1, 2), - t_vec.at(1)}, - {r_mat.at(2, 0), r_mat.at(2, 1), r_mat.at(2, 2), - t_vec.at(2)}, - {0.0, 0.0, 0.0, 1.0}}; - - return camera_to_tag; -} - inline void draw_tag_cube(cv::Mat& frame, Eigen::Matrix camera_to_tag, const Eigen::Matrix& camera_matrix, @@ -312,15 +175,18 @@ inline void draw_tag_cube(cv::Mat& frame, } } +/** + * Convert a video file to a list of keyframes + */ inline bool process_video_file( apriltag_detector_t* tag_detector, const Eigen::Matrix& camera_matrix, const Eigen::Matrix& camera_distortion, double tag_size, - const std::string& path, - std::map, - Eigen::aligned_allocator>>& poses, - std::vector>& constraints, - bool show_debug_window) { + const std::string& path, gtsam::Key& keyframe, + wpical::KeyframeMap& outputMap, bool show_debug_window) { + // clear inputs + outputMap.clear(); + if (show_debug_window) { cv::namedWindow("Processing Frame", cv::WINDOW_NORMAL); } @@ -331,15 +197,13 @@ inline bool process_video_file( return false; } + // Reuse mats if we can - allocatiosn are expensive cv::Mat frame; cv::Mat frame_gray; cv::Mat frame_debug; - int frame_num = 0; - while (video_input.read(frame)) { - std::cout << "Processing " << path << " - Frame " << frame_num++ - << std::endl; + std::cout << "Processing " << path << " - Frame " << keyframe << std::endl; // Convert color frame to grayscale frame cv::cvtColor(frame, frame_gray, cv::COLOR_BGR2GRAY); @@ -359,61 +223,41 @@ inline bool process_video_file( continue; } - // Find detection with the smallest tag ID - apriltag_detection_t* tag_detection_min = nullptr; - zarray_get(tag_detections, 0, &tag_detection_min); - - for (int i = 0; i < zarray_size(tag_detections); i++) { - apriltag_detection_t* tag_detection_i; - zarray_get(tag_detections, i, &tag_detection_i); - - if (tag_detection_i->id < tag_detection_min->id) { - tag_detection_min = tag_detection_i; - } - } - - Eigen::Matrix camera_to_tag_min = estimate_tag_pose( - tag_detection_min, camera_matrix, camera_distortion, tag_size); - - // Find transformation from smallest tag ID + std::vector tagsThisKeyframe; for (int i = 0; i < zarray_size(tag_detections); i++) { apriltag_detection_t* tag_detection_i; zarray_get(tag_detections, i, &tag_detection_i); - // Add tag ID to poses - if (poses.find(tag_detection_i->id) == poses.end()) { - poses[tag_detection_i->id] = {Eigen::Vector3d(0.0, 0.0, 0.0), - Eigen::Quaterniond(1.0, 0.0, 0.0, 0.0)}; + // Convert to our data type. I don't love how complicated this is. + auto atag = reinterpret_cast(tag_detection_i); + auto tag_corners_cv = std::vector{}; + for (int corn = 0; corn < 4; corn++) { + tag_corners_cv.emplace_back(atag->GetCorner(corn).x, + atag->GetCorner(corn).y); } - // Estimate camera to tag pose - Eigen::Matrix caamera_to_tag = estimate_tag_pose( - tag_detection_i, camera_matrix, camera_distortion, tag_size); + // Undistort so gtsam doesn't have to deal with distortion + cv::Mat camCalCv(camera_matrix.rows(), camera_matrix.cols(), CV_64F); + cv::Mat camDistCv(camera_distortion.rows(), camera_distortion.cols(), + CV_64F); - // Draw debug cube - if (show_debug_window) { - draw_tag_cube(frame_debug, caamera_to_tag, camera_matrix, - camera_distortion, tag_size); - } + cv::eigen2cv(camera_matrix, camCalCv); + cv::eigen2cv(camera_distortion, camDistCv); - // Skip finding transformation from smallest tag ID to itself - if (tag_detection_i->id == tag_detection_min->id) { - continue; - } + cv::undistortImagePoints(tag_corners_cv, tag_corners_cv, camCalCv, + camDistCv); - Eigen::Matrix tag_min_to_tag = - camera_to_tag_min.inverse() * caamera_to_tag; + TagDetection tag; + tag.id = atag->GetId(); - // Constraint - Constraint constraint; - constraint.id_begin = tag_detection_min->id; - constraint.id_end = tag_detection_i->id; - constraint.t_begin_end.p = tag_min_to_tag.block<3, 1>(0, 3); - constraint.t_begin_end.q = - Eigen::Quaterniond(tag_min_to_tag.block<3, 3>(0, 0)); + tag.corners.resize(4); + std::transform(tag_corners_cv.begin(), tag_corners_cv.end(), + tag.corners.begin(), + [](const auto& it) { return TargetCorner{it.x, it.y}; }); - constraints.push_back(constraint); + tagsThisKeyframe.push_back(tag); } + outputMap[keyframe] = tagsThisKeyframe; apriltag_detections_destroy(tag_detections); @@ -422,6 +266,9 @@ inline bool process_video_file( cv::imshow("Processing Frame", frame_debug); cv::waitKey(1); } + + // Keep track of the frame number + keyframe++; } video_input.release(); @@ -437,9 +284,9 @@ int fieldcalibration::calibrate(std::string input_dir_path, std::string camera_model_path, std::string ideal_map_path, int pinned_tag_id, bool show_debug_window) { - // Silence OpenCV logging - cv::utils::logging::setLogLevel( - cv::utils::logging::LogLevel::LOG_LEVEL_SILENT); + // // Silence OpenCV logging + // cv::utils::logging::setLogLevel( + // cv::utils::logging::LogLevel::LOG_LEVEL_SILENT); // Load camera model Eigen::Matrix3d camera_matrix; @@ -453,19 +300,25 @@ int fieldcalibration::calibrate(std::string input_dir_path, return 1; } - wpi::json json = wpi::json::parse(std::ifstream(ideal_map_path)); - if (!json.contains("tags")) { - return 1; - } + // Convert intrinsics to gtsam-land. Order fx fy s u0 v0 + gtsam::Cal3_S2 gtsam_cal{ + camera_matrix(0, 0), camera_matrix(1, 1), 0, + camera_matrix(0, 2), camera_matrix(1, 2), + }; - // Load ideal field map - std::map ideal_map; + // Load the seed field layout from disk + frc::AprilTagFieldLayout idealTagLayout; try { - ideal_map = load_ideal_map(ideal_map_path); + wpi::json json = wpi::json::parse(std::ifstream(ideal_map_path)); + idealTagLayout = json.get(); } catch (...) { + std::cerr << "Could not deserialize" << ideal_map_path << std::endl; return 1; } + // Load ideal field map + std::map ideal_map; + // Apriltag detector apriltag_detector_t* tag_detector = apriltag_detector_create(); tag_detector->nthreads = 8; @@ -473,23 +326,25 @@ int fieldcalibration::calibrate(std::string input_dir_path, apriltag_family_t* tag_family = tag36h11_create(); apriltag_detector_add_family(tag_detector, tag_family); - // Find tag poses - std::map, - Eigen::aligned_allocator>> - poses; - std::vector> constraints; + // Write down keyframes from all our video files + wpical::KeyframeMap outputMap; + + gtsam::Key keyframe{gtsam::symbol_shorthand::X(0)}; + + constexpr units::meter_t TAG_SIZE = 0.1651_m; for (const auto& entry : std::filesystem::directory_iterator(input_dir_path)) { + // Ignore hidden files if (entry.path().filename().string()[0] == '.') { continue; } const std::string path = entry.path().string(); - bool success = process_video_file(tag_detector, camera_matrix, - camera_distortion, tagSizeMeters, path, - poses, constraints, show_debug_window); + bool success = process_video_file( + tag_detector, camera_matrix, camera_distortion, TAG_SIZE.to(), + path, keyframe, outputMap, show_debug_window); if (!success) { std::cout << "Unable to process video " << path << std::endl; @@ -497,100 +352,32 @@ int fieldcalibration::calibrate(std::string input_dir_path, } } - // Build optimization problem - ceres::Problem problem; - ceres::Manifold* quaternion_manifold = new ceres::EigenQuaternionManifold; + wpical::GtsamApriltagMap layoutGuess{idealTagLayout, TAG_SIZE}; - for (const auto& constraint : constraints) { - auto pose_begin_iter = poses.find(constraint.id_begin); - auto pose_end_iter = poses.find(constraint.id_end); + // TODO - handle constraints more generally (ie, multiple, tunable) - ceres::CostFunction* cost_function = - PoseGraphError::Create(constraint.t_begin_end); + // Noise on our pose prior. order is rx, ry, rz, tx, ty, tz, and units are + // [rad] and [m]. + // Guess ~1 degree and 5 mm for fun. + using gtsam::Vector3; + using gtsam::Vector6; + Vector6 sigmas; + sigmas << Vector3::Constant(0.015), Vector3::Constant(0.005); + gtsam::SharedNoiseModel posePriorNoise = + gtsam::noiseModel::Diagonal::Sigmas(sigmas); + std::map> fixedTags{ + {pinned_tag_id, + {layoutGuess.WorldToTag(pinned_tag_id).value(), posePriorNoise}}}; - problem.AddResidualBlock(cost_function, nullptr, - pose_begin_iter->second.p.data(), - pose_begin_iter->second.q.coeffs().data(), - pose_end_iter->second.p.data(), - pose_end_iter->second.q.coeffs().data()); + // one pixel in u and v - TODO don't hardcode this + gtsam::SharedNoiseModel cameraNoise{ + gtsam::noiseModel::Isotropic::Sigma(2, 1.0)}; - problem.SetManifold(pose_begin_iter->second.q.coeffs().data(), - quaternion_manifold); - problem.SetManifold(pose_end_iter->second.q.coeffs().data(), - quaternion_manifold); - } - - // Pin tag - auto pinned_tag_iter = poses.find(pinned_tag_id); - if (pinned_tag_iter != poses.end()) { - problem.SetParameterBlockConstant(pinned_tag_iter->second.p.data()); - problem.SetParameterBlockConstant( - pinned_tag_iter->second.q.coeffs().data()); - } - - // Solve - ceres::Solver::Options options; - options.max_num_iterations = 200; - options.linear_solver_type = ceres::SPARSE_NORMAL_CHOLESKY; - options.num_threads = 10; - - ceres::Solver::Summary summary; - ceres::Solve(options, &problem, &summary); - - std::cout << summary.BriefReport() << std::endl; - - // Output - std::map observed_map = ideal_map; - - Eigen::Matrix correction_a; - correction_a << 0, 0, -1, 0, 1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 1; - - Eigen::Matrix correction_b; - correction_b << 0, 1, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 1; - - Eigen::Matrix pinned_tag_transform = - get_tag_transform(ideal_map, pinned_tag_id); - - for (const auto& [tag_id, pose] : poses) { - // Transformation from pinned tag - Eigen::Matrix transform = - Eigen::Matrix::Identity(); - - transform.block<3, 3>(0, 0) = pose.q.toRotationMatrix(); - transform.block<3, 1>(0, 3) = pose.p; - - // Transformation from world - Eigen::Matrix corrected_transform = - pinned_tag_transform * correction_a * transform * correction_b; - Eigen::Quaternion corrected_transform_q( - corrected_transform.block<3, 3>(0, 0)); - - observed_map[tag_id]["pose"]["translation"]["x"] = - corrected_transform(0, 3); - observed_map[tag_id]["pose"]["translation"]["y"] = - corrected_transform(1, 3); - observed_map[tag_id]["pose"]["translation"]["z"] = - corrected_transform(2, 3); - - observed_map[tag_id]["pose"]["rotation"]["quaternion"]["X"] = - corrected_transform_q.x(); - observed_map[tag_id]["pose"]["rotation"]["quaternion"]["Y"] = - corrected_transform_q.y(); - observed_map[tag_id]["pose"]["rotation"]["quaternion"]["Z"] = - corrected_transform_q.z(); - observed_map[tag_id]["pose"]["rotation"]["quaternion"]["W"] = - corrected_transform_q.w(); - } - - wpi::json observed_map_json; - - for (const auto& [tag_id, tag_json] : observed_map) { - observed_map_json["tags"].push_back(tag_json); - } + auto calResult = wpical::OptimizeLayout(layoutGuess, outputMap, gtsam_cal, + fixedTags, cameraNoise); - observed_map_json["field"] = { - {"length", static_cast(json.at("field").at("length"))}, - {"width", static_cast(json.at("field").at("width"))}}; + // Convert the output AprilTagFieldLayout to a json + wpi::json observed_map_json = calResult.optimizedLayout; std::ofstream output_file(output_file_path); output_file << observed_map_json.dump(4) << std::endl; diff --git a/wpical/src/main/native/cpp/gtsam_meme/Pose3WithVariance.h b/wpical/src/main/native/cpp/gtsam_meme/Pose3WithVariance.h new file mode 100644 index 00000000000..5f9694dce0c --- /dev/null +++ b/wpical/src/main/native/cpp/gtsam_meme/Pose3WithVariance.h @@ -0,0 +1,27 @@ +// Copyright (c) FIRST and other WPILib contributors. +// Open Source Software; you can modify and/or share it under the terms of +// the WPILib BSD license file in the root directory of this project. + +#pragma once + +#include +#include + +struct Pose3WithVariance { + frc::Pose3d pose; + // in order rx ry rz tx ty tz. Units are SI units + std::array covariance; + + static Pose3WithVariance FromEigen(frc::Pose3d pose, + Eigen::MatrixXd covariance) { + std::array cov; + + for (size_t i = 0; i < 6; i++) { + cov[i] = covariance(i); + } + + return {pose, cov}; + } +}; + +#include "Pose3WithVarianceStruct.h" diff --git a/wpical/src/main/native/cpp/gtsam_meme/Pose3WithVarianceStruct.h b/wpical/src/main/native/cpp/gtsam_meme/Pose3WithVarianceStruct.h new file mode 100644 index 00000000000..25a942b6468 --- /dev/null +++ b/wpical/src/main/native/cpp/gtsam_meme/Pose3WithVarianceStruct.h @@ -0,0 +1,52 @@ +// Copyright (c) FIRST and other WPILib contributors. +// Open Source Software; you can modify and/or share it under the terms of +// the WPILib BSD license file in the root directory of this project. + +#pragma once + +#include "Pose3WithVariance.h" + +template <> +struct WPILIB_DLLEXPORT wpi::Struct { + static constexpr size_t poseSize = wpi::Struct::GetSize(); + + static constexpr std::string_view GetTypeName() { + return "Pose3WithVariance"; + } + + static constexpr size_t GetSize() { return poseSize + (6 * sizeof(double)); } + + static constexpr std::string_view GetSchema() { + return "Pose3d pose; double covariance[6]"; + } + + static Pose3WithVariance Unpack(std::span data) { + return Pose3WithVariance{ + wpi::UnpackStruct(data), + { + wpi::UnpackStruct(data), + wpi::UnpackStruct(data), + wpi::UnpackStruct(data), + wpi::UnpackStruct(data), + wpi::UnpackStruct(data), + wpi::UnpackStruct(data), + }}; + } + + static void Pack(std::span data, const Pose3WithVariance& value) { + wpi::PackStruct<0>(data, value.pose); + wpi::PackStruct(data, value.covariance[0]); + wpi::PackStruct(data, value.covariance[1]); + wpi::PackStruct(data, value.covariance[2]); + wpi::PackStruct(data, value.covariance[3]); + wpi::PackStruct(data, value.covariance[4]); + wpi::PackStruct(data, value.covariance[5]); + } + + static void ForEachNested( + std::invocable auto fn) { + wpi::ForEachStructSchema(fn); + } +}; + +static_assert(wpi::StructSerializable); diff --git a/wpical/src/main/native/cpp/gtsam_meme/README.md b/wpical/src/main/native/cpp/gtsam_meme/README.md new file mode 100644 index 00000000000..16b6c811e5f --- /dev/null +++ b/wpical/src/main/native/cpp/gtsam_meme/README.md @@ -0,0 +1,127 @@ +# SFM Mapper + +## Resources + +- Factor Graphs and GTSAM. https://gtsam.org/tutorials/intro.html#magicparlabel-65377 +- Factor Graphs for Robot Perception. Dellaert & Kaess. https://www.cs.cmu.edu/~kaess/pub/Dellaert17fnt.pdf +- An Introduction to Factor Graphs. Loeliger. https://people.binf.ku.dk/~thamelry/MLSB08/hal.pdf (a good slide deck) + +The TLDR of the TLDR (really. Go skim at least [Factor Graphs and GTSAM](https://gtsam.org/tutorials/intro.html#magicparlabel-65411)) is that a factor graph is a graph of variables (things we want to optimize, like robot poses or landmark pose) connected by "factors" which describe relationships between our variables. For example, this factor graph shows 3 robot state variables (x1, x2, and x3; the white circles). The relationship between these states is described by factors (black filled in circles) which encode information about robot pose delta between states (in the backbone), and an external measurement from a sensor like GPS. + +![](https://gtsam.org/tutorials/intro-images/5_Users_dellaert_git_github_doc_images_FactorGraph2.png) + +The primary difference between this and tradidional nonlinear least squares is that we don't have to explicitly build a cost function. Instead, we're interested in finding the maximum likelihood solution, given a factor graph and *noise models* (see Dellaert Ch. 2.2), which describe how sure we are about measurements. + +Terminology +- Key: A unique identifier for a state, such as X0, L0, etc. Really a size_t in a typedef. +- Expression: an autodiff expression. Can be constant or constructed using a Key + +## Problem Formulation + +We build up a factor graph of landmarks (tags), and estimated camera poses in the world. Our decision variables are: +- Poses of tags in the world (located sufficiently near an initial guess that things don't diverge) + - This is the pose of the tag origin. The 4 tag corners are offset from this tag origin by +- 3 inches in x and z. +- Poses from where camera observations were recorded from + +Our factors: +- From camera observation pose -> each tag corner we can see (in pixels). Noise model is is tag corner location in pixels. The tag corner is a GTSAM "expression" that is (tag origin + (tag origin -> corner N)) for each corner. +- A "pose prior factor" on up to N many tags describing how sure we are in its pose in SE(3). In this example, we measured tag 8's location with a tape measure, but didn't measure any other ones. We could add prior factors to as many tags as we want - GTSAM will maximize the overall solution likelihood given this information. + + +```mermaid +graph TD; + x0[Observation 0] + x1[Observation 1] + x2[Observation 2] + x3[Observation 3] + + l6[Tag 6 Corners] + l7[Tag 7 Corners] + l8[Tag 8 Corners] + l9[Tag 9 Corners] + l10[Tag 10 Corners] + l15[Tag 15 Corners] + + l6o[Tag 6 Origin] + l7o[Tag 7 Origin] + l8o[Tag 8 Origin] + l9o[Tag 9 Origin] + l10o[Tag 10 Origin] + l15o[Tag 15 Origin] + + prior(Tag 8 pose prior factor) + + l6 --- l6o + l7 --- l7o + l8 --- l8o + l9 --- l9o + l10 --- l10o + l15 --- l15o + + x0 <-- F0 --> l6 + x0 <-- F1 --> l7 + x0 <-- F2 --> l8 + + x1 <-- F3 --> l6 + x1 <-- F4 --> l7 + x1 <-- F5 --> l8 + + x2 <-- F6 --> l7 + x2 <-- F7 --> l8 + x2 <-- F8 --> l9 + x2 <-- F9 --> l10 + + x3 <-- F10 --> l9 + x3 <-- F11 --> l10 + x3 <-- F12 --> l15 + + prior --- l8o +``` + +In code this ends up boiling down to something like this for tag observations: + +```cpp +// Get a list of Expressions for each corner's location in the field given an +// Expression for the tag origin pose in the field +vector WorldToCornersFactor(Pose3_ worldTtag) { + vector out; + for (const auto &p : tagToCorners) { + out.push_back(gtsam::transformFrom(worldTtag, p)); + } + + return out; +} + +// later + +Isotropic::shared_ptr cameraNoise{ + Isotropic::Sigma(2, 1.0)}; // one pixel in u and v + +// Add all our tag observation factors +for (const auto &[stateKey, tags] : keyframes) { + for (const TagDetection &tag : tags) { + // Where our tag's corners are in the field + auto worldPcorners = TagModel::WorldToCornersFactor(L(tag.id)); + + // add all of the tag's corners + constexpr int NUM_CORNERS = 4; + for (size_t i = 0; i < NUM_CORNERS; i++) { + // Decision variable - where our camera is in the world + const Pose3_ worldTbody_fac(stateKey); + + // Where we'd predict the i'th corner of the tag to be, + // given camera located at (worldTbody_fac * robotTcamera) observing worldPcorners[i] + const auto prediction = PredictLandmarkImageLocationFactor( + worldTbody_fac, robotTcamera, cameraCal, worldPcorners[i]); + + // where we saw the i'th corner in the image + Point2 measurement = {tag.corners[i].x, tag.corners[i].y}; + + // Add this prediction/measurement pair to our graph + graph.addExpressionFactor(prediction, measurement, cameraNoise); + } + } +} +``` + +And... that's about it! We just need to provide a sane initial guess for the solver (I used solvepnp for camera poses, and the starting tag map for tag poses), as well as add prior factors to at least one tag to anchor our map, and throw this at Dogleg or L-M or another traditional nonlinear optimization algorithm. GTSAM will take care of converting our factor graph into a Bayes net (Dellaert, ch 1.3) and then maximizing the product of all factor graph potentials. diff --git a/wpical/src/main/native/cpp/gtsam_meme/TagDetection.h b/wpical/src/main/native/cpp/gtsam_meme/TagDetection.h new file mode 100644 index 00000000000..cc339590482 --- /dev/null +++ b/wpical/src/main/native/cpp/gtsam_meme/TagDetection.h @@ -0,0 +1,22 @@ +// Copyright (c) FIRST and other WPILib contributors. +// Open Source Software; you can modify and/or share it under the terms of +// the WPILib BSD license file in the root directory of this project. + +#pragma once + +#include + +#include +#include + +struct TargetCorner { + double x; + double y; +}; + +struct TagDetection { + int32_t id; + std::vector corners; + + inline int GetFiducialId() const { return id; } +}; diff --git a/wpical/src/main/native/cpp/gtsam_meme/TagDetectionStruct.h b/wpical/src/main/native/cpp/gtsam_meme/TagDetectionStruct.h new file mode 100644 index 00000000000..6429e73effa --- /dev/null +++ b/wpical/src/main/native/cpp/gtsam_meme/TagDetectionStruct.h @@ -0,0 +1,58 @@ +// Copyright (c) FIRST and other WPILib contributors. +// Open Source Software; you can modify and/or share it under the terms of +// the WPILib BSD license file in the root directory of this project. + +#pragma once + +#include +#include + +#include "TagDetection.h" + +template <> +struct WPILIB_DLLEXPORT wpi::Struct { + static constexpr std::string_view GetTypeName() { return "TagDetection"; } + + static constexpr size_t GetSize() { return ((8 * 2) * 4 + 4); } + + static constexpr std::string_view GetSchema() { + return "uint32 id;double cx1;double cy1;double cx2;double cy2;double " + "cx3;double cy3;double cx4;double cy4"; + } + + static TagDetection Unpack(std::span data) { + return TagDetection{wpi::UnpackStruct(data), + { + { + wpi::UnpackStruct(data), + wpi::UnpackStruct(data), + }, + { + wpi::UnpackStruct(data), + wpi::UnpackStruct(data), + }, + { + wpi::UnpackStruct(data), + wpi::UnpackStruct(data), + }, + { + wpi::UnpackStruct(data), + wpi::UnpackStruct(data), + }, + }}; + } + + static void Pack(std::span data, const TagDetection& value) { + wpi::PackStruct<0>(data, value.id); + wpi::PackStruct<4 + 8 * 0>(data, value.corners[0].x); + wpi::PackStruct<4 + 8 * 1>(data, value.corners[0].y); + wpi::PackStruct<4 + 8 * 2>(data, value.corners[1].x); + wpi::PackStruct<4 + 8 * 3>(data, value.corners[1].y); + wpi::PackStruct<4 + 8 * 4>(data, value.corners[2].x); + wpi::PackStruct<4 + 8 * 5>(data, value.corners[2].y); + wpi::PackStruct<4 + 8 * 6>(data, value.corners[3].x); + wpi::PackStruct<4 + 8 * 7>(data, value.corners[3].y); + } +}; + +static_assert(wpi::StructSerializable); diff --git a/wpical/src/main/native/cpp/gtsam_meme/gtsam_tag_map.cpp b/wpical/src/main/native/cpp/gtsam_meme/gtsam_tag_map.cpp new file mode 100644 index 00000000000..80815e9f5d4 --- /dev/null +++ b/wpical/src/main/native/cpp/gtsam_meme/gtsam_tag_map.cpp @@ -0,0 +1,116 @@ +// Copyright (c) FIRST and other WPILib contributors. +// Open Source Software; you can modify and/or share it under the terms of +// the WPILib BSD license file in the root directory of this project. + +#include "gtsam_tag_map.h" + +#include +#include + +#include +#include + +#include "pose_converters.h" + +using namespace gtsam; +using namespace wpical; + +static std::map TagLayoutToMap( + const frc::AprilTagFieldLayout& layout) { + std::map worldTtags; + + for (const frc::AprilTag& tag : layout.GetTags()) { + worldTtags[tag.ID] = Pose3dToGtsamPose3(tag.pose); + } + + return worldTtags; +} + +static std::vector MakeTagModel(double width) { + return { + {0, -width / 2.0, -width / 2.0}, + {0, width / 2.0, -width / 2.0}, + {0, width / 2.0, width / 2.0}, + {0, -width / 2.0, width / 2.0}, + }; +} +static std::vector MakeTagModelOpenCV(double width) { + return {cv::Point3f(0, -width / 2.0, -width / 2.0), + cv::Point3f(0, +width / 2.0, -width / 2.0), + cv::Point3f(0, +width / 2.0, +width / 2.0), + cv::Point3f(0, -width / 2.0, +width / 2.0)}; +} + +wpical::GtsamApriltagMap::GtsamApriltagMap( + const frc::AprilTagFieldLayout& layout, units::meter_t tagWidth) + : tagToCorners{MakeTagModel(tagWidth.to())}, + tagToCornersCv{MakeTagModelOpenCV(tagWidth.to())}, + worldTtags{TagLayoutToMap(layout)} {} + +const std::optional wpical::GtsamApriltagMap::WorldToTag( + const int id) const { + if (auto it = worldTtags.find(id); it != worldTtags.end()) { + return it->second; + } else { + return std::nullopt; + } +} + +const std::vector wpical::GtsamApriltagMap::WorldToCornersFactor( + const Pose3_ worldTtag) const { + std::vector out; + out.reserve(tagToCorners.size()); + for (const auto& p : tagToCorners) { + out.push_back(transformFrom(worldTtag, p)); + } + + return out; +} + +gtsam::Point2_ wpical::PredictLandmarkImageLocationFactor( + gtsam::Pose3_ worldTcamera_fac, gtsam::Cal3_S2_ cameraCal, + gtsam::Point3_ worldPcorner) { + // Tag corner 3D position in the camera frame + const Point3_ camPcorner = transformTo(worldTcamera_fac, worldPcorner); + // project from vector down to pinhole location, then uncalibrate to pixel + // locations + const Point2_ prediction = + uncalibrate(cameraCal, project(camPcorner)); + + return prediction; +} + +std::optional wpical::EstimateWorldTCam_SingleTag( + TagDetection tagToUse, wpical::GtsamApriltagMap aprilTags, + CameraMatrix camMat) { + Eigen::Matrix distCoeffs = wpical::DistortionMatrix::Zero(); + + const std::vector& objectPoints = aprilTags.TagToCornersCv(); + + std::vector imagePoints; + imagePoints.reserve(4); + for (const auto& corner : tagToUse.corners) { + imagePoints.emplace_back(corner.x, corner.y); + } + + // eigen/cv marshalling + cv::Mat cameraMatCV(camMat.rows(), camMat.cols(), cv::DataType::type); + cv::eigen2cv(camMat, cameraMatCV); + cv::Mat distCoeffsMatCV(distCoeffs.rows(), distCoeffs.cols(), + cv::DataType::type); + cv::eigen2cv(distCoeffs, distCoeffsMatCV); + + // actually do solvepnp + cv::Mat rvec(3, 1, cv::DataType::type); + cv::Mat tvec(3, 1, cv::DataType::type); + cv::solvePnP(objectPoints, imagePoints, cameraMatCV, distCoeffsMatCV, rvec, + tvec); + + if (const auto w2tag = aprilTags.WorldToTag(tagToUse.id)) { + auto camToTag = Pose3dToGtsamPose3(ToPose3d(tvec, rvec)); + auto tag2cam = camToTag.inverse(); + return w2tag.value().transformPoseFrom(tag2cam); + } else { + return std::nullopt; + } +} diff --git a/wpical/src/main/native/cpp/gtsam_meme/gtsam_tag_map.h b/wpical/src/main/native/cpp/gtsam_meme/gtsam_tag_map.h new file mode 100644 index 00000000000..76af7c6e120 --- /dev/null +++ b/wpical/src/main/native/cpp/gtsam_meme/gtsam_tag_map.h @@ -0,0 +1,75 @@ +// Copyright (c) FIRST and other WPILib contributors. +// Open Source Software; you can modify and/or share it under the terms of +// the WPILib BSD license file in the root directory of this project. + +#pragma once +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "TagDetection.h" + +namespace wpical { +using CameraMatrix = Eigen::Matrix; +using DistortionMatrix = Eigen::Matrix; + +class GtsamApriltagMap { + public: + GtsamApriltagMap(const frc::AprilTagFieldLayout& layout, + units::meter_t tagWidth); + + /** + * The pose of tag {id} in the field map, or empty if not found. + */ + const std::optional WorldToTag(const int id) const; + + /* + * List of corners mapped from 3d space (meters) to the 2d camera screen + * (pixels). + * ⟶ +X 3 ----- 2 + * | | | + * V | | + * +Y 0 ----- 1 + * In object space, we have this order, with origin at the center and +X out + * of the tag + */ + inline const std::vector& TagToCornersCv() const { + return tagToCornersCv; + } + + /** + * A list of autodiff Expressions for the location of this tag's 4 corners in + * the field, given an Expression for the tag origin's pose. + */ + const std::vector WorldToCornersFactor( + const gtsam::Pose3_ worldTtag) const; + + private: + std::vector tagToCorners; + std::vector tagToCornersCv; + std::map worldTtags; +}; + +/** + * Create an Expression for where we would see {worldPcorner} in an image + * captured by a camera with {cameraCal} located at {worldTcamera_fac}. + */ +gtsam::Point2_ PredictLandmarkImageLocationFactor( + gtsam::Pose3_ worldTcamera_fac, gtsam::Cal3_S2_ cameraCal, + gtsam::Point3_ worldPcorner); + +/** + * Generate a seed pose for our camera, given a single tag detection. Note that + * tag corners must be externally undistorted. + */ +std::optional EstimateWorldTCam_SingleTag( + TagDetection result, GtsamApriltagMap aprilTags, CameraMatrix camMat); + +} // namespace wpical diff --git a/wpical/src/main/native/cpp/gtsam_meme/pose_converters.cpp b/wpical/src/main/native/cpp/gtsam_meme/pose_converters.cpp new file mode 100644 index 00000000000..bfc90bcd18c --- /dev/null +++ b/wpical/src/main/native/cpp/gtsam_meme/pose_converters.cpp @@ -0,0 +1,44 @@ +// Copyright (c) FIRST and other WPILib contributors. +// Open Source Software; you can modify and/or share it under the terms of +// the WPILib BSD license file in the root directory of this project. + +#include "pose_converters.h" + +using namespace wpical; +using namespace gtsam; + +Pose3 wpical::Pose3dToGtsamPose3(frc::Pose3d pose) { + const auto q = pose.Rotation().GetQuaternion(); + return Pose3{Rot3(q.W(), q.X(), q.Y(), q.Z()), + Point3(pose.X().to(), pose.Y().to(), + pose.Z().to())}; +} + +frc::Pose3d wpical::GtsamToFrcPose3d(gtsam::Pose3 pose) { + return frc::Pose3d{ + frc::Translation3d{units::meter_t{pose.x()}, units::meter_t{pose.y()}, + units::meter_t{pose.z()}}, + frc::Rotation3d{pose.rotation().matrix()}}; +} + +frc::Pose3d wpical::ToPose3d(const cv::Mat& tvec, const cv::Mat& rvec) { + using namespace frc; + using namespace units; + + // cv::Mat R; + // cv::Rodrigues(rvec, R); // R is 3x3 + // R = R.t(); // rotation of inverse + // cv::Mat tvecI = -R * tvec; // translation of inverse + + Eigen::Matrix tv; + tv[0] = tvec.at(0, 0); + tv[1] = tvec.at(1, 0); + tv[2] = tvec.at(2, 0); + Eigen::Matrix rv; + rv[0] = rvec.at(0, 0); + rv[1] = rvec.at(1, 0); + rv[2] = rvec.at(2, 0); + + return Pose3d(Translation3d(meter_t{tv[0]}, meter_t{tv[1]}, meter_t{tv[2]}), + Rotation3d(rv)); +} diff --git a/wpical/src/main/native/cpp/gtsam_meme/pose_converters.h b/wpical/src/main/native/cpp/gtsam_meme/pose_converters.h new file mode 100644 index 00000000000..a224bf37938 --- /dev/null +++ b/wpical/src/main/native/cpp/gtsam_meme/pose_converters.h @@ -0,0 +1,16 @@ +// Copyright (c) FIRST and other WPILib contributors. +// Open Source Software; you can modify and/or share it under the terms of +// the WPILib BSD license file in the root directory of this project. + +#pragma once + +#include + +#include +#include + +namespace wpical { +gtsam::Pose3 Pose3dToGtsamPose3(frc::Pose3d pose); +frc::Pose3d GtsamToFrcPose3d(gtsam::Pose3 pose); +frc::Pose3d ToPose3d(const cv::Mat& tvec, const cv::Mat& rvec); +} // namespace wpical diff --git a/wpical/src/main/native/cpp/gtsam_meme/wpical_gtsam.cpp b/wpical/src/main/native/cpp/gtsam_meme/wpical_gtsam.cpp new file mode 100644 index 00000000000..47a4d26bf77 --- /dev/null +++ b/wpical/src/main/native/cpp/gtsam_meme/wpical_gtsam.cpp @@ -0,0 +1,218 @@ +// Copyright (c) FIRST and other WPILib contributors. +// Open Source Software; you can modify and/or share it under the terms of +// the WPILib BSD license file in the root directory of this project. + +#include "wpical_gtsam.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "pose_converters.h" + +// #define OPENCV_DISABLE_EIGEN_TENSOR_SUPPORT +#include + +using namespace wpical; +using namespace gtsam; +using symbol_shorthand::L; +using symbol_shorthand::X; + +using CameraMatrix = Eigen::Matrix; +using DistortionMatrix = Eigen::Matrix; + +/** + * Check if a given tag {id} was seen in a + */ +std::set TagsUsed(KeyframeMap tags) { + std::vector v; + for (const auto& [key, tags] : tags) { + for (const TagDetection& tag : tags) { + v.push_back(tag.id); + } + } + return {v.begin(), v.end()}; +} + +CalResult wpical::OptimizeLayout( + const GtsamApriltagMap& tagLayoutGuess, KeyframeMap keyframes, + gtsam::Cal3_S2 cal, + const std::map>& + fixedTags, + const gtsam::SharedNoiseModel cameraNoise) { + ExpressionFactorGraph graph; + + // Make sure our tags are all in the map + for (auto& [stateKey, tags] : keyframes) { + tags.erase(std::remove_if(tags.begin(), tags.end(), + [&](const auto& tag) { + bool ret = !tagLayoutGuess.WorldToTag(tag.id); + std::cout << "Checking tag " << tag.id + << " -- got " << ret << std::endl; + return ret; + }), + tags.end()); + } + std::erase_if(keyframes, [](auto& kv) { return kv.second.empty(); }); + + // constrain fixed(ish) tags - future work can investigate partial pose priors + for (const auto& [tagId, info] : fixedTags) { + graph.addPrior(L(tagId), std::get<0>(info), std::get<1>(info)); + } + + // Add all our tag observation factors + for (const auto& [stateKey, tags] : keyframes) { + for (const TagDetection& tag : tags) { + std::cout << "Adding factors for tag " << tag.id << std::endl; + auto worldPcorners = + tagLayoutGuess.WorldToCornersFactor(Pose3_{L(tag.id)}); + + // add each tag corner + constexpr int NUM_CORNERS = 4; + for (size_t i = 0; i < NUM_CORNERS; i++) { + // Decision variable - where our camera is in the world + const Pose3_ worldTcamera_fac(stateKey); + // Where we'd predict the i'th corner of the tag to be + const auto prediction = PredictLandmarkImageLocationFactor( + worldTcamera_fac, cal, worldPcorners[i]); + // where we saw the i'th corner in the image + Point2 measurement = {tag.corners[i].x, tag.corners[i].y}; + // Add this prediction/measurement pair to our graph + graph.addExpressionFactor(prediction, measurement, cameraNoise); + } + } + } + + // Initial guess for our optimizer. Needs to be in the right ballpark, but + // accuracy doesn't super matter + Values initial; + + // Guess for all camera poses based on tag layout JSON + for (const auto& [stateKey, tags] : keyframes) { + if (!tags.size()) { + std::cerr << "Can't guess pose of camera for observation " << stateKey + << " (no tags in observation)" << std::endl; + continue; + } + + auto worldTcam_guess = + EstimateWorldTCam_SingleTag(*tags.begin(), tagLayoutGuess, cal.K()); + if (!worldTcam_guess) { + std::cerr << "Can't guess pose of camera for observation " << stateKey + << std::endl; + } else { + initial.insert(stateKey, *worldTcam_guess); + } + } + + // Guess for tag locations = tag layout json + for (int id : TagsUsed(keyframes)) { + if (auto worldTtag = tagLayoutGuess.WorldToTag(id)) { + initial.insert(L(id), worldTtag.value()); + } else { + std::cerr << "Can't guess pose of tag " << id << std::endl; + } + } + + /* Optimize the graph and print results */ + std::cout << "==========================\ninitial error = " + << graph.error(initial) << std::endl; + auto start = std::chrono::steady_clock::now(); + + DoglegParams params; + params.verbosity = NonlinearOptimizerParams::ERROR; + // params.relativeErrorTol = 1e-3; + // params.absoluteErrorTol = 1e-3; + + // Create initial optimizer + DoglegOptimizer optimizer{graph, initial, params}; + + // Run full optimization until convergence. + Values result; + try { + result = optimizer.optimize(); + } catch (std::exception* e) { + std::cerr << e->what(); + return {}; + } + + auto end = std::chrono::steady_clock::now(); + auto dt = end - start; + uint64_t microseconds = + std::chrono::duration_cast(dt).count(); + + std::cout << "\n===== Converged in " << optimizer.iterations() + << " iterations (" << microseconds << " uS) with final error " + << optimizer.error() << " ======" << std::endl; + + { + std::stringstream ss; + ss << "tag_map_" << result.size() << ".dot"; + graph.saveGraph(ss.str(), result); + } + + CalResult ret; + ret.result = result; + + { + gtsam::Marginals marginals(graph, result); + std::vector tags; + + std::cout << "Results:" << std::endl; + for (auto [key, value] : result) { + std::cout << "\n========= Key " << gtsam::Symbol(key) << " ==========\n"; + + // Assume all our keys are pose3 factors. lol. Print out some fun info + // about them + auto est = GtsamToFrcPose3d(result.at(key)); + fmt::println("Estimated pose:"); + fmt::println("Translation: x={:.2f} y={:.2f} z={:.2f}", est.X(), est.Y(), + est.Z()); + fmt::println("Rotation: W={:.3f} X={:.3f} Y={:.3f} Z={:.3f}", + est.Rotation().GetQuaternion().W(), + est.Rotation().GetQuaternion().X(), + est.Rotation().GetQuaternion().Y(), + est.Rotation().GetQuaternion().Z()); + + // Covariance is the variance of x_i with x_i - stddev is sqrt(var) + gtsam::Matrix marginalCov = marginals.marginalCovariance(key); + auto stddev = marginalCov.diagonal().cwiseSqrt(); + + std::cout << "Marginal stddev (r t):" << std::endl << stddev << std::endl; + + // todo - track all tag keys instead of this hack + if (key >= L(0) && key <= L(1000000)) { + int id = static_cast(key - L(0)); + + tags.push_back(frc::AprilTag{id, est}); + ret.tagPoseCovariances[id] = Pose3WithVariance::FromEigen(est, stddev); + } + if (key >= X(0) && key <= X(1000000)) { + ret.cameraPoseCovariances[key] = + Pose3WithVariance::FromEigen(est, stddev); + } + } + + frc::AprilTagFieldLayout layout{tags, 16.541_m, 8.211_m}; + ret.optimizedLayout = layout; + } + + return ret; +} diff --git a/wpical/src/main/native/cpp/gtsam_meme/wpical_gtsam.h b/wpical/src/main/native/cpp/gtsam_meme/wpical_gtsam.h new file mode 100644 index 00000000000..60ff0ef2dcc --- /dev/null +++ b/wpical/src/main/native/cpp/gtsam_meme/wpical_gtsam.h @@ -0,0 +1,41 @@ +// Copyright (c) FIRST and other WPILib contributors. +// Open Source Software; you can modify and/or share it under the terms of +// the WPILib BSD license file in the root directory of this project. + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "Pose3WithVariance.h" +#include "TagDetection.h" +#include "gtsam_tag_map.h" + +namespace wpical { +using KeyframeMap = std::map>; + +struct CalResult { + gtsam::Values result; + + std::map tagPoseCovariances; + std::map cameraPoseCovariances; + + frc::AprilTagFieldLayout optimizedLayout; +}; + +// note that we expect the pixel locations to be -undistorted- here +CalResult OptimizeLayout( + const GtsamApriltagMap& tagLayoutGuess, KeyframeMap keyframes, + gtsam::Cal3_S2 cal, + const std::map>& + fixedTags, + const gtsam::SharedNoiseModel cameraNoise); +} // namespace wpical diff --git a/wpical/src/test/native/cpp/test_calibrate.cpp b/wpical/src/test/native/cpp/test_calibrate.cpp index c185face6f8..b9d7bf3d715 100644 --- a/wpical/src/test/native/cpp/test_calibrate.cpp +++ b/wpical/src/test/native/cpp/test_calibrate.cpp @@ -26,64 +26,64 @@ const std::string videoLocation = "/altfieldvideo"; const std::string fileSuffix = ".mp4"; const std::string videoLocation = "/fieldvideo"; #endif -TEST(Camera_CalibrationTest, OpenCV_Typical) { - int ret = cameracalibration::calibrate( - projectRootPath + "/testcalibration" + fileSuffix, cameraModel, 0.709f, - 0.551f, 12, 8, false); - cameracalibration::dumpJson(cameraModel, - calSavePath + "/cameracalibration.json"); - EXPECT_EQ(ret, 0); -} +// TEST(Camera_CalibrationTest, OpenCV_Typical) { +// int ret = cameracalibration::calibrate( +// projectRootPath + "/testcalibration" + fileSuffix, 0.709f, 0.551f, 12, +// 8, false); +// EXPECT_EQ(ret, 0); +// } -TEST(Camera_CalibrationTest, OpenCV_Atypical) { - int ret = cameracalibration::calibrate( - projectRootPath + videoLocation + "/long" + fileSuffix, cameraModel, - 0.709f, 0.551f, 12, 8, false); - EXPECT_EQ(ret, 1); -} +// TEST(Camera_CalibrationTest, OpenCV_Atypical) { +// int ret = cameracalibration::calibrate( +// projectRootPath + videoLocation + "/long" + fileSuffix, 0.709f, 0.551f, +// 12, 8, false); +// EXPECT_EQ(ret, 1); +// } -TEST(Camera_CalibrationTest, MRcal_Typical) { - int ret = cameracalibration::calibrate( - projectRootPath + "/testcalibration" + fileSuffix, cameraModel, .709f, 12, - 8, 1080, 1920, false); - EXPECT_EQ(ret, 0); -} +// TEST(Camera_CalibrationTest, MRcal_Typical) { +// int ret = cameracalibration::calibrate( +// projectRootPath + "/testcalibration" + fileSuffix, 0.709f, 12, 8, 1080, +// 1920, false); +// EXPECT_EQ(ret, 0); +// } -TEST(Camera_CalibrationTest, MRcal_Atypical) { - int ret = cameracalibration::calibrate( - projectRootPath + videoLocation + "/short" + fileSuffix, cameraModel, - 0.709f, 12, 8, 1080, 1920, false); - EXPECT_EQ(ret, 1); -} +// TEST(Camera_CalibrationTest, MRcal_Atypical) { +// int ret = cameracalibration::calibrate( +// projectRootPath + videoLocation + "/short" + fileSuffix, 0.709f, 12, 8, +// 1080, 1920, false); +// EXPECT_EQ(ret, 1); +// } TEST(Field_CalibrationTest, Typical) { int ret = fieldcalibration::calibrate( - projectRootPath + videoLocation, calSavePath, - calSavePath + "/cameracalibration.json", + "/home/matt/Videos/wpical-testing", + projectRootPath + "/2024-output-typical.json", + "/home/matt/Downloads/" + "photon_calibration_Arducam_OV2311_USB_Camera_1600x1200.json", projectRootPath + "/2024-crescendo.json", 3, false); EXPECT_EQ(ret, 0); } -TEST(Field_CalibrationTest, Atypical_Bad_Camera_Model_Directory) { - int ret = fieldcalibration::calibrate( - projectRootPath + videoLocation, calSavePath, - projectRootPath + videoLocation + "/long" + fileSuffix, - projectRootPath + "/2024-crescendo.json", 3, false); - EXPECT_EQ(ret, 1); -} +// TEST(Field_CalibrationTest, Atypical_Bad_Camera_Model_Directory) { +// int ret = fieldcalibration::calibrate( +// projectRootPath + videoLocation, projectRootPath + "", +// projectRootPath + videoLocation + "/long" + fileSuffix, +// projectRootPath + "/2024-crescendo.json", 3, false); +// EXPECT_EQ(ret, 1); +// } -TEST(Field_CalibrationTest, Atypical_Bad_Ideal_JSON) { - int ret = fieldcalibration::calibrate( - projectRootPath + videoLocation, calSavePath, - calSavePath + "/cameracalibration.json", - calSavePath + "/cameracalibration.json", 3, false); - EXPECT_EQ(ret, 1); -} +// TEST(Field_CalibrationTest, Atypical_Bad_Ideal_JSON) { +// int ret = fieldcalibration::calibrate( +// projectRootPath + videoLocation, projectRootPath + "", +// projectRootPath + "/camera_calibration.json", +// projectRootPath + "/camera_calibration.json", 3, false); +// EXPECT_EQ(ret, 1); +// } -TEST(Field_CalibrationTest, Atypical_Bad_Input_Directory) { - int ret = fieldcalibration::calibrate( - projectRootPath + "", calSavePath, - calSavePath + "/cameracalibration.json", - projectRootPath + "/2024-crescendo.json", 3, false); - EXPECT_EQ(ret, 1); -} +// TEST(Field_CalibrationTest, Atypical_Bad_Input_Directory) { +// int ret = fieldcalibration::calibrate( +// projectRootPath + "", projectRootPath + "", +// projectRootPath + "/camera_calibration.json", +// projectRootPath + "/2024-crescendo.json", 3, false); +// EXPECT_EQ(ret, 1); +// } diff --git a/wpimath/src/main/native/thirdparty/eigen/include/Eigen/Dense b/wpimath/src/main/native/thirdparty/eigen/include/Eigen/Dense new file mode 100644 index 00000000000..5768910bd88 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/Eigen/Dense @@ -0,0 +1,7 @@ +#include "Core" +#include "LU" +#include "Cholesky" +#include "QR" +#include "SVD" +#include "Geometry" +#include "Eigenvalues" diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/Tensor b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/Tensor new file mode 100644 index 00000000000..6ac89f1b67e --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/Tensor @@ -0,0 +1,142 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// #ifndef EIGEN_CXX11_TENSOR_MODULE_H +#define EIGEN_CXX11_TENSOR_MODULE_H + +#include "../../../Eigen/Core" + +#include "../SpecialFunctions" + +#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" + +// IWYU pragma: begin_exports +#include "../../../Eigen/src/Core/util/Meta.h" +#include "../../../Eigen/src/Core/util/MaxSizeVector.h" +// IWYU pragma: end_exports + +/** \defgroup CXX11_Tensor_Module Tensor Module + * + * This module provides a Tensor class for storing arbitrarily indexed + * objects. + * + * \code + * #include + * \endcode + * + * Much of the documentation can be found \ref eigen_tensors "here". + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL) +// #include "ThreadPool" +#endif + +#ifdef EIGEN_USE_GPU +#include +#if defined(EIGEN_USE_HIP) +#include +#else +#include +#endif +#endif + +// IWYU pragma: begin_exports +#include "src/Tensor/TensorMacros.h" +#include "src/Tensor/TensorForwardDeclarations.h" +#include "src/Tensor/TensorMeta.h" +#include "src/Tensor/TensorFunctors.h" +#include "src/Tensor/TensorCostModel.h" +#include "src/Tensor/TensorDeviceDefault.h" +#include "src/Tensor/TensorDeviceThreadPool.h" +#include "src/Tensor/TensorDeviceGpu.h" +#ifndef gpu_assert +#define gpu_assert(x) +#endif +#include "src/Tensor/TensorDeviceSycl.h" +#include "src/Tensor/TensorIndexList.h" +#include "src/Tensor/TensorDimensionList.h" +#include "src/Tensor/TensorDimensions.h" +#include "src/Tensor/TensorInitializer.h" +#include "src/Tensor/TensorTraits.h" +#include "src/Tensor/TensorRandom.h" +#include "src/Tensor/TensorUInt128.h" +#include "src/Tensor/TensorIntDiv.h" +#include "src/Tensor/TensorGlobalFunctions.h" + +#include "src/Tensor/TensorIO.h" + +#include "src/Tensor/TensorBase.h" +#include "src/Tensor/TensorBlock.h" + +#include "src/Tensor/TensorEvaluator.h" +#include "src/Tensor/TensorExpr.h" +#include "src/Tensor/TensorReduction.h" +#include "src/Tensor/TensorReductionGpu.h" +#include "src/Tensor/TensorArgMax.h" +#include "src/Tensor/TensorConcatenation.h" +#include "src/Tensor/TensorContractionMapper.h" +#include "src/Tensor/TensorContractionBlocking.h" +#include "src/Tensor/TensorContraction.h" +#include "src/Tensor/TensorContractionThreadPool.h" +#include "src/Tensor/TensorContractionGpu.h" +#include "src/Tensor/TensorConversion.h" +#include "src/Tensor/TensorConvolution.h" +#include "src/Tensor/TensorFFT.h" +#include "src/Tensor/TensorPatch.h" +#include "src/Tensor/TensorImagePatch.h" +#include "src/Tensor/TensorVolumePatch.h" +#include "src/Tensor/TensorBroadcasting.h" +#include "src/Tensor/TensorChipping.h" +#include "src/Tensor/TensorInflation.h" +#include "src/Tensor/TensorLayoutSwap.h" +#include "src/Tensor/TensorMorphing.h" +#include "src/Tensor/TensorPadding.h" +#include "src/Tensor/TensorReverse.h" +#include "src/Tensor/TensorRoll.h" +#include "src/Tensor/TensorShuffling.h" +#include "src/Tensor/TensorStriding.h" +#include "src/Tensor/TensorCustomOp.h" +#include "src/Tensor/TensorEvalTo.h" +#include "src/Tensor/TensorForcedEval.h" +#include "src/Tensor/TensorGenerator.h" +#include "src/Tensor/TensorAssign.h" +#include "src/Tensor/TensorScan.h" +#include "src/Tensor/TensorTrace.h" + +#ifdef EIGEN_USE_SYCL +#include "src/Tensor/TensorReductionSycl.h" +#include "src/Tensor/TensorConvolutionSycl.h" +#include "src/Tensor/TensorContractionSycl.h" +#include "src/Tensor/TensorScanSycl.h" +#endif + +#include "src/Tensor/TensorExecutor.h" +#include "src/Tensor/TensorDevice.h" + +#include "src/Tensor/TensorStorage.h" +#include "src/Tensor/Tensor.h" +#include "src/Tensor/TensorFixedSize.h" +#include "src/Tensor/TensorMap.h" +#include "src/Tensor/TensorRef.h" +// IWYU pragma: end_exports + +#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +// #endif // EIGEN_CXX11_TENSOR_MODULE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/TensorSymmetry b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/TensorSymmetry new file mode 100644 index 00000000000..d4a44d0a704 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/TensorSymmetry @@ -0,0 +1,40 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE_H +#define EIGEN_CXX11_TENSORSYMMETRY_MODULE_H + +#include "Tensor" + +#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" + +// #include "src/util/CXX11Meta.h" + +/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module + * + * This module provides a classes that allow for the definition of + * symmetries w.r.t. tensor indices. + * + * Including this module will implicitly include the Tensor module. + * + * \code + * #include + * \endcode + */ + +// IWYU pragma: begin_exports +#include "src/TensorSymmetry/util/TemplateGroupTheory.h" +#include "src/TensorSymmetry/Symmetry.h" +#include "src/TensorSymmetry/StaticSymmetry.h" +#include "src/TensorSymmetry/DynamicSymmetry.h" +// IWYU pragma: end_exports + +#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/InternalHeaderCheck.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/InternalHeaderCheck.h new file mode 100644 index 00000000000..9e4c1ed9c37 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/InternalHeaderCheck.h @@ -0,0 +1,3 @@ +#ifndef EIGEN_CXX11_TENSOR_MODULE_H +#error "Please include unsupported/Eigen/CXX11/Tensor instead of including headers inside the src directory directly." +#endif diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h new file mode 100644 index 00000000000..9dc95916c0c --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -0,0 +1,382 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class Tensor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor class. + * + * The %Tensor class is the work-horse for all \em dense tensors within Eigen. + * + * The %Tensor class encompasses only dynamic-size objects so far. + * + * The first two template parameters are required: + * \tparam Scalar_ Numeric type, e.g. float, double, int or `std::complex`. + * User defined scalar types are supported as well (see \ref user_defined_scalars "here"). + * \tparam NumIndices_ Number of indices (i.e. rank of the tensor) + * + * The remaining template parameters are optional -- in most cases you don't have to worry about them. + * \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of either + * \b #AutoAlign or \b #DontAlign. + * The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter + * controls alignment, which is required for vectorization. It defaults to aligning tensors. Note that tensors currently + * do not support any operations that profit from vectorization. Support for such operations (i.e. adding two tensors + * etc.) is planned. + * + * You can access elements of tensors using normal subscripting: + * + * \code + * Eigen::Tensor t(10, 10, 10, 10); + * t(0, 1, 2, 3) = 42.0; + * \endcode + * + * This class can be extended with the help of the plugin mechanism described on the page + * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN, + * \c EIGEN_TENSORBASE_PLUGIN, and \c EIGEN_READONLY_TENSORBASE_PLUGIN. + * + * Some notes: + * + *
+ *
Relation to other parts of Eigen:
+ *
The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that + * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code + * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the + * %Tensor class does not provide any of these features and is only available as a stand-alone class that just allows + * for coefficient access. Also, when fixed-size tensors are implemented, the number of template arguments is likely to + * change dramatically.
+ *
+ * + * \ref TopicStorageOrders + */ + +template +class Tensor : public TensorBase > { + public: + typedef Tensor Self; + typedef TensorBase > Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef Scalar_ Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + enum { IsAligned = (EIGEN_MAX_ALIGN_BYTES > 0) && !(Options_ & DontAlign), CoordAccess = true, RawAccess = true }; + + static constexpr int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + static constexpr int Options = Options_; + static constexpr int NumIndices = NumIndices_; + typedef DSizes Dimensions; + + protected: + TensorStorage m_storage; + + template + struct isOfNormalIndex { + static const bool is_array = internal::is_base_of, CustomIndices>::value; + static const bool is_int = NumTraits::IsInteger; + static const bool value = is_array | is_int; + }; + + public: + // Metadata + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar* data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex, + IndexTypes... otherIndices) const { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array{{firstIndex, secondIndex, otherIndices...}}); + } + + // normal indices + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + // custom indices + template ::value))> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const { + return coeff(internal::customIndices2Array(indices)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + + template + inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array{{firstIndex, secondIndex, otherIndices...}}); + } + + // normal indices + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + // custom indices + template ::value))> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices) { + return coeffRef(internal::customIndices2Array(indices)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + + template + inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array{{firstIndex, secondIndex, otherIndices...}}); + } + + // custom indices + template ::value))> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const { + return coeff(internal::customIndices2Array(indices)); + } + + // normal indices + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const { + return coeff(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + + template + inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array{{firstIndex, secondIndex, otherIndices...}}); + } + + // normal indices + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) { + return coeffRef(indices); + } + + // custom indices + template ::value))> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices) { + return coeffRef(internal::customIndices2Array(indices)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeffRef(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor() : m_storage() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(const Self& other) : Base(other), m_storage(other.m_storage) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions) + : m_storage(firstDimension, otherDimensions...) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + + /** Normal Dimension */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array& dimensions) + : m_storage(internal::array_prod(dimensions), dimensions) { + EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(const TensorBase& other) { + EIGEN_STATIC_ASSERT(OtherDerived::NumDimensions == Base::NumDimensions, Number_of_dimensions_must_match) + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(const TensorBase& other) { + EIGEN_STATIC_ASSERT(OtherDerived::NumDimensions == Base::NumDimensions, Number_of_dimensions_must_match) + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Self&& other) : m_storage(std::move(other.m_storage)) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(Self&& other) { + m_storage = std::move(other.m_storage); + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + template + EIGEN_DEVICE_FUNC void resize(Index firstDimension, IndexTypes... otherDimensions) { + // The number of dimensions used to resize a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + resize(array{{firstDimension, otherDimensions...}}); + } + + /** Normal Dimension */ + EIGEN_DEVICE_FUNC void resize(const array& dimensions) { +#ifndef EIGEN_NO_DEBUG + Index size = Index(1); + for (int i = 0; i < NumIndices; i++) { + internal::check_rows_cols_for_overflow::run(size, dimensions[i]); + size *= dimensions[i]; + } +#else + Index size = internal::array_prod(dimensions); +#endif + +#ifdef EIGEN_INITIALIZE_COEFFS + bool size_changed = size != this->size(); + m_storage.resize(size, dimensions); + if (size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED +#else + m_storage.resize(size, dimensions); +#endif + } + + EIGEN_DEVICE_FUNC void resize() { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + // Nothing to do: rank 0 tensors have fixed size + } + + template + EIGEN_DEVICE_FUNC void resize(const Eigen::IndexList& dimensions) { + array dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = static_cast(dimensions[i]); + } + resize(dims); + } + + /** Custom Dimension */ + template ::value))> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions) { + resize(internal::customIndices2Array(dimensions)); + } + + template + EIGEN_DEVICE_FUNC void resize(const Sizes& dimensions) { + array dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = static_cast(dimensions[i]); + } + resize(dims); + } + +#ifdef EIGEN_TENSOR_PLUGIN +#include EIGEN_TENSOR_PLUGIN +#endif + + protected: + bool checkIndexRange(const array& indices) const { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::lesser_op; + using internal::logical_and_op; + + return + // check whether the indices are all >= 0 + array_apply_and_reduce(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce(indices, m_storage.dimensions()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const { + if (Options & RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h new file mode 100644 index 00000000000..f272354c4c5 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -0,0 +1,284 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Eugene Brevdo +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H +#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +/** \class TensorIndexPair + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor + Index Pair class. + * + * + */ +template +struct traits> : public traits { + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef Pair Scalar; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorIndexPairOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, typename eval>::type> { + typedef TensorIndexPairOp type; +}; + +} // end namespace internal + +template +class TensorIndexPairOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + typedef Pair CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexPairOp(const XprType& expr) : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorIndexPairOp XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + typedef typename TensorEvaluator::Dimensions Dimensions; + static constexpr int NumDims = internal::array_size::value; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = /*TensorEvaluator::PacketAccess*/ false, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = false + }; + static constexpr int Layout = TensorEvaluator::Layout; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return CoeffReturnType(index, m_impl.coeff(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + protected: + TensorEvaluator m_impl; +}; + +namespace internal { + +/** \class TensorPairIndex + * \ingroup CXX11_Tensor_Module + * + * \brief Converts to Tensor > and reduces to Tensor. + * + */ +template +struct traits> : public traits { + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef Index Scalar; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions - array_size::value; + static constexpr int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorPairReducerOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, + typename eval>::type> { + typedef TensorPairReducerOp type; +}; + +} // end namespace internal + +template +class TensorPairReducerOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + typedef Index CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPairReducerOp(const XprType& expr, const ReduceOp& reduce_op, + const Index return_dim, const Dims& reduce_dims) + : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC const ReduceOp& reduce_op() const { return m_reduce_op; } + + EIGEN_DEVICE_FUNC const Dims& reduce_dims() const { return m_reduce_dims; } + + EIGEN_DEVICE_FUNC Index return_dim() const { return m_return_dim; } + + protected: + typename XprType::Nested m_xpr; + const ReduceOp m_reduce_op; + const Index m_return_dim; + const Dims m_reduce_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorPairReducerOp XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename TensorIndexPairOp::CoeffReturnType PairType; + typedef typename TensorEvaluator>, + Device>::Dimensions Dimensions; + typedef typename TensorEvaluator, Device>::Dimensions InputDimensions; + static constexpr int NumDims = internal::array_size::value; + typedef array StrideDims; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory PairStorageMem; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = /*TensorEvaluator::PacketAccess*/ false, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = false + }; + static constexpr int Layout = + TensorEvaluator>, Device>::Layout; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_orig_impl(op.expression(), device), + m_impl(op.expression().index_pairs().reduce(op.reduce_dims(), op.reduce_op()), device), + m_return_dim(op.return_dim()) { + gen_strides(m_orig_impl.dimensions(), m_strides); + if (Layout == static_cast(ColMajor)) { + const Index total_size = internal::array_prod(m_orig_impl.dimensions()); + m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size; + } else { + const Index total_size = internal::array_prod(m_orig_impl.dimensions()); + m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size; + } + // If m_return_dim is not a valid index, returns 1 or this can crash on Windows. + m_stride_div = + ((m_return_dim >= 0) && (m_return_dim < static_cast(m_strides.size()))) ? m_strides[m_return_dim] : 1; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + const PairType v = m_impl.coeff(index); + return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div; + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = + 1.0 + (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost() + TensorOpCost::DivCost())); + return m_orig_impl.costPerCoeff(vectorized) + m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost); + } + + private: + EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) { + if (m_return_dim < 0) { + return; // Won't be using the strides. + } + eigen_assert(m_return_dim < NumDims && "Asking to convert index to a dimension outside of the rank"); + + // Calculate m_stride_div and m_stride_mod, which are used to + // calculate the value of an index w.r.t. the m_return_dim. + if (Layout == static_cast(ColMajor)) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i - 1] * dims[i - 1]; + } + } else { + strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dims[i + 1]; + } + } + } + + protected: + TensorEvaluator, Device> m_orig_impl; + TensorEvaluator>, Device> m_impl; + const Index m_return_dim; + StrideDims m_strides; + Index m_stride_mod; + Index m_stride_div; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h new file mode 100644 index 00000000000..8a57576b7ba --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -0,0 +1,213 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H +#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorAssign + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor assignment class. + * + * This class is represents the assignment of the values resulting from the evaluation of + * the rhs expression to the memory locations denoted by the lhs expression. + */ +namespace internal { +template +struct traits > { + typedef typename LhsXprType::Scalar Scalar; + typedef typename traits::StorageKind StorageKind; + typedef + typename promote_index_type::Index, typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; + static constexpr std::size_t NumDimensions = internal::traits::NumDimensions; + static constexpr int Layout = internal::traits::Layout; + typedef typename traits::PointerType PointerType; + + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorAssignOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorAssignOp type; +}; + +} // end namespace internal + +template +class TensorAssignOp : public TensorBase > { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename LhsXprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + static constexpr int NumDims = Eigen::internal::traits::NumDimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC internal::remove_all_t& lhsExpression() const { + return *((internal::remove_all_t*)&m_lhs_xpr); + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& rhsExpression() const { + return m_rhs_xpr; + } + + protected: + internal::remove_all_t& m_lhs_xpr; + const internal::remove_all_t& m_rhs_xpr; +}; + +template +struct TensorEvaluator, Device> { + typedef TensorAssignOp XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int PacketSize = PacketType::size; + static constexpr int NumDims = XprType::NumDims; + static constexpr int Layout = TensorEvaluator::Layout; + + enum { + IsAligned = + int(TensorEvaluator::IsAligned) & int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess), + BlockAccess = int(TensorEvaluator::BlockAccess) & + int(TensorEvaluator::BlockAccess), + PreferBlockAccess = int(TensorEvaluator::PreferBlockAccess) | + int(TensorEvaluator::PreferBlockAccess), + RawAccess = TensorEvaluator::RawAccess + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock RightTensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { + // The dimensions of the lhs and the rhs tensors should be equal to prevent + // overflows and ensure the result is fully initialized. + // TODO: use left impl instead if right impl dimensions are known at compile time. + return m_rightImpl.dimensions(); + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + m_leftImpl.evalSubExprsIfNeeded(NULL); + // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non + // null value), attempt to evaluate the rhs expression in place. Returns true iff in place + // evaluation isn't supported and the caller still needs to manually assign the values generated + // by the rhs to the lhs. + return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { + m_rightImpl.evalSubExprsIfNeededAsync(m_leftImpl.data(), [done](bool need_assign) { done(need_assign); }); + }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const { + m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const { + const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); + } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_leftImpl.coeff(index); } + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return m_leftImpl.template packet(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here, but reduce left + // cost by one load because we are using m_leftImpl.coeffRef. + TensorOpCost left = m_leftImpl.costPerCoeff(vectorized); + return m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)), left.bytes_stored(), + left.compute_cycles()) + + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::merge(m_leftImpl.getResourceRequirements(), + m_rightImpl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlockDesc& desc, TensorBlockScratch& scratch) { + if (TensorEvaluator::RawAccess && m_leftImpl.data() != NULL) { + // If destination has raw data access, we pass it as a potential + // destination for a block descriptor evaluation. + desc.template AddDestinationBuffer( + /*dst_base=*/m_leftImpl.data() + desc.offset(), + /*dst_strides=*/internal::strides(m_leftImpl.dimensions())); + } + + RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true); + // If block was evaluated into a destination, there is no need to do assignment. + if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { + m_leftImpl.writeBlock(desc, block); + } + block.cleanup(); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); } + + private: + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h new file mode 100644 index 00000000000..fc3f3b78a84 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -0,0 +1,1244 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H +#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H + +// clang-format off + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorBase + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor base class. + * + * This class is the common parent of the Tensor and TensorMap class, thus + * making it possible to use either class interchangeably in expressions. + */ +#ifndef EIGEN_PARSED_BY_DOXYGEN +// FIXME Doxygen does not like the inheritance with different template parameters +// Since there is no doxygen documentation inside, we disable it for now +template +class TensorBase +{ + public: + typedef internal::traits DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef std::remove_const_t CoeffReturnType; + static constexpr int NumDimensions = DerivedTraits::NumDimensions; + + // Generic nullary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp + nullaryExpr(const CustomNullaryOp& func) const { + return TensorCwiseNullaryOp(derived(), func); + } + + // Coefficient-wise nullary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + constant(const Scalar& value) const { + return nullaryExpr(internal::scalar_constant_op(value)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + random() const { + return nullaryExpr(internal::UniformRandomGenerator()); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp + random(const RandomGenerator& gen = RandomGenerator()) const { + return nullaryExpr(gen); + } + + // Tensor generation + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorGeneratorOp + generate(const Generator& generator) const { + return TensorGeneratorOp(derived(), generator); + } + + // Generic unary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp + unaryExpr(const CustomUnaryOp& func) const { + return TensorCwiseUnaryOp(derived(), func); + } + + // Coefficient-wise unary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator-() const { + return unaryExpr(internal::scalar_opposite_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sqrt() const { + return unaryExpr(internal::scalar_sqrt_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sign() const { + return unaryExpr(internal::scalar_sign_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + rsqrt() const { + return unaryExpr(internal::scalar_rsqrt_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + square() const { + return unaryExpr(internal::scalar_square_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + cube() const { + return unaryExpr(internal::scalar_cube_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + inverse() const { + return unaryExpr(internal::scalar_inverse_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + tanh() const { + return unaryExpr(internal::scalar_tanh_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + lgamma() const { + return unaryExpr(internal::scalar_lgamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + digamma() const { + return unaryExpr(internal::scalar_digamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i0() const { + return unaryExpr(internal::scalar_bessel_i0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i0e() const { + return unaryExpr(internal::scalar_bessel_i0e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i1() const { + return unaryExpr(internal::scalar_bessel_i1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i1e() const { + return unaryExpr(internal::scalar_bessel_i1e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_j0() const { + return unaryExpr(internal::scalar_bessel_j0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_y0() const { + return unaryExpr(internal::scalar_bessel_y0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_j1() const { + return unaryExpr(internal::scalar_bessel_j1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_y1() const { + return unaryExpr(internal::scalar_bessel_y1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k0() const { + return unaryExpr(internal::scalar_bessel_k0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k0e() const { + return unaryExpr(internal::scalar_bessel_k0e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k1() const { + return unaryExpr(internal::scalar_bessel_k1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k1e() const { + return unaryExpr(internal::scalar_bessel_k1e_op()); + } + + // igamma(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igamma(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igamma_op()); + } + + // igamma_der_a(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igamma_der_a(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igamma_der_a_op()); + } + + // gamma_sample_der_alpha(alpha = this, sample = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + gamma_sample_der_alpha(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_gamma_sample_der_alpha_op()); + } + + // igammac(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igammac(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igammac_op()); + } + + // zeta(x = this, q = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + zeta(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_zeta_op()); + } + + // polygamma(n = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + polygamma(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_polygamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + erf() const { + return unaryExpr(internal::scalar_erf_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + erfc() const { + return unaryExpr(internal::scalar_erfc_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + ndtri() const { + return unaryExpr(internal::scalar_ndtri_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sigmoid() const { + return unaryExpr(internal::scalar_logistic_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + exp() const { + return unaryExpr(internal::scalar_exp_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + expm1() const { + return unaryExpr(internal::scalar_expm1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log() const { + return unaryExpr(internal::scalar_log_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log1p() const { + return unaryExpr(internal::scalar_log1p_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log2() const { + return unaryExpr(internal::scalar_log2_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + abs() const { + return unaryExpr(internal::scalar_abs_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + arg() const { + return unaryExpr(internal::scalar_arg_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + clip(Scalar min, Scalar max) const { + return unaryExpr(internal::scalar_clamp_op(min, max)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const std::conditional_t::IsComplex, + TensorCwiseUnaryOp, const Derived>, + Derived> + conjugate() const { + return choose(Cond::IsComplex>(), unaryExpr(internal::scalar_conjugate_op()), derived()); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::enable_if_t::Real>::value, + TensorCwiseUnaryOp, const Derived>> + pow(ScalarExponent exponent) const + { + return unaryExpr(internal::scalar_unary_pow_op(exponent)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + real() const { + return unaryExpr(internal::scalar_real_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + imag() const { + return unaryExpr(internal::scalar_imag_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator+ (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator+ (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator- (Scalar rhs) const { + EIGEN_STATIC_ASSERT((NumTraits::IsSigned || internal::is_same >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator- (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator* (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator* (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator/ (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator/ (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator% (Scalar rhs) const { + EIGEN_STATIC_ASSERT(NumTraits::IsInteger, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD); + return unaryExpr(internal::scalar_mod_op(rhs)); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + cwiseMax(Scalar threshold) const { + return cwiseMax(constant(threshold)); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + cwiseMin(Scalar threshold) const { + return cwiseMin(constant(threshold)); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const std::conditional_t::value, + Derived, + TensorConversionOp > + cast() const { + return choose(Cond::value>(), derived(), TensorConversionOp(derived())); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + round() const { + return unaryExpr(internal::scalar_round_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + rint() const { + return unaryExpr(internal::scalar_rint_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + ceil() const { + return unaryExpr(internal::scalar_ceil_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + floor() const { + return unaryExpr(internal::scalar_floor_op()); + } + + // Generic binary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp + binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const { + return TensorCwiseBinaryOp(derived(), other, func); + } + + // Coefficient-wise binary operators. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator+(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_sum_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator-(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_difference_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator*(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_product_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator/(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_quotient_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + cwiseMax(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_max_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + cwiseMin(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_min_op()); + } + + // logical operators + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator&&(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_and_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator||(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator&(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_bitwise_and_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator|(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_bitwise_or_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator^(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_bitwise_xor_op()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseUnaryOp, const Derived> + operator!() const { + return unaryExpr(internal::scalar_boolean_not_op()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseUnaryOp, const Derived> + operator~() const { + return unaryExpr(internal::scalar_bitwise_not_op()); + } + + // Comparisons and tests. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator<(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator<=(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator>(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator>=(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator==(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator!=(const TensorBase& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + // comparisons and tests for Scalars + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator<(Scalar threshold) const { + return operator<(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator<=(Scalar threshold) const { + return operator<=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator>(Scalar threshold) const { + return operator>(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator>=(Scalar threshold) const { + return operator>=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator==(Scalar threshold) const { + return operator==(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator!=(Scalar threshold) const { + return operator!=(constant(threshold)); + } + + // Predicates. + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorConversionOp, const Derived>> + (isnan)() const { + return unaryExpr(internal::scalar_isnan_op()).template cast(); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorConversionOp, const Derived>> + (isinf)() const { + return unaryExpr(internal::scalar_isinf_op()).template cast(); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorConversionOp, const Derived>> + (isfinite)() const { + return unaryExpr(internal::scalar_isfinite_op()).template cast(); + } + + // Coefficient-wise ternary operators. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSelectOp + select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { + return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); + } + + // Contractions. + typedef Eigen::IndexPair DimensionPair; + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp + contract(const OtherDerived& other, const Dimensions& dims) const { + return TensorContractionOp(derived(), other.derived(), dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp + contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& output_kernel) const { + return TensorContractionOp(derived(), other.derived(), dims, output_kernel); + } + + // Convolutions. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConvolutionOp + convolve(const KernelDerived& kernel, const Dimensions& dims) const { + return TensorConvolutionOp(derived(), kernel.derived(), dims); + } + + // Fourier transforms + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorFFTOp + fft(const FFT& dims) const { + return TensorFFTOp(derived(), dims); + } + + // Scan. + typedef TensorScanOp, const Derived> TensorScanSumOp; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanSumOp + cumsum(const Index& axis, bool exclusive = false) const { + return TensorScanSumOp(derived(), axis, exclusive); + } + + typedef TensorScanOp, const Derived> TensorScanProdOp; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanProdOp + cumprod(const Index& axis, bool exclusive = false) const { + return TensorScanProdOp(derived(), axis, exclusive); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanOp + scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const { + return TensorScanOp(derived(), axis, exclusive, reducer); + } + + // Reductions. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + sum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::SumReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + sum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::SumReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + mean(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MeanReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + mean() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MeanReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + prod(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::ProdReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + prod() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::ProdReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + maximum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); + } + + template + const TensorReductionOp, const DimensionList, const Derived> + maximum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MaxReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + minimum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); + } + + template + const TensorReductionOp, const DimensionList, const Derived> + minimum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MinReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp::value, Derived, TensorConversionOp > > + all(const Dims& dims) const { + return cast().reduce(dims, internal::AndReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const std::conditional_t::value, Derived, TensorConversionOp > > + all() const { + DimensionList in_dims; + return cast().reduce(in_dims, internal::AndReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp::value, Derived, TensorConversionOp > > + any(const Dims& dims) const { + return cast().reduce(dims, internal::OrReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const std::conditional_t::value, Derived, TensorConversionOp > > + any() const { + DimensionList in_dims; + return cast().reduce(in_dims, internal::OrReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPairReducerOp< + internal::ArgMaxPairReducer >, + const array, const Derived> + argmax() const { + array in_dims; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; + return TensorPairReducerOp< + internal::ArgMaxPairReducer >, + const array, + const Derived>(derived(), internal::ArgMaxPairReducer >(), -1, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPairReducerOp< + internal::ArgMinPairReducer >, + const array, const Derived> + argmin() const { + array in_dims; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; + return TensorPairReducerOp< + internal::ArgMinPairReducer >, + const array, + const Derived>(derived(), internal::ArgMinPairReducer >(), -1, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPairReducerOp< + internal::ArgMaxPairReducer >, + const array, const Derived> + argmax(const Index return_dim) const { + array in_dims; + in_dims[0] = return_dim; + return TensorPairReducerOp< + internal::ArgMaxPairReducer >, + const array, + const Derived>(derived(), internal::ArgMaxPairReducer >(), return_dim, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPairReducerOp< + internal::ArgMinPairReducer >, + const array, const Derived> + argmin(const Index return_dim) const { + array in_dims; + in_dims[0] = return_dim; + return TensorPairReducerOp< + internal::ArgMinPairReducer >, + const array, + const Derived>(derived(), internal::ArgMinPairReducer >(), return_dim, in_dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp + reduce(const Dims& dims, const Reducer& reducer) const { + return TensorReductionOp(derived(), dims, reducer); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTraceOp + trace(const Dims& dims) const { + return TensorTraceOp(derived(), dims); + } + + const TensorTraceOp, const Derived> + trace() const { + DimensionList in_dims; + return TensorTraceOp, const Derived>(derived(), in_dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorBroadcastingOp + broadcast(const Broadcast& bcast) const { + return TensorBroadcastingOp(derived(), bcast); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp + concatenate(const OtherDerived& other, Axis axis) const { + return TensorConcatenationOp(derived(), other.derived(), axis); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPatchOp + extract_patches(const PatchDims& patch_dims) const { + return TensorPatchOp(derived(), patch_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index patch_rows = 1, const Index patch_cols = 1, + const Index row_stride = 1, const Index col_stride = 1, + const Index in_row_stride = 1, const Index in_col_stride = 1, + const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, 1, 1, padding_type, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride, + const Index row_inflate_stride, const Index col_inflate_stride, + const Index padding_top, const Index padding_bottom, + const Index padding_left,const Index padding_right, + const Scalar padding_value) const { + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride, + padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorVolumePatchOp + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1, + const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { + return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value); + } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorVolumePatchOp + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride, const Index row_stride, const Index col_stride, + const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride, + const Index padding_top_z, const Index padding_bottom_z, + const Index padding_top, const Index padding_bottom, + const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const { + return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + + // Morphing operators. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp + swap_layout() const { + return TensorLayoutSwapOp(derived()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp(derived(), newDimensions); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset) const { + EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range) + return TensorChippingOp(derived(), offset, DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorRollOp + roll(const Rolls& rolls) const { + return TensorRollOp(derived(), rolls); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp + pad(const PaddingDimensions& padding) const { + return TensorPaddingOp(derived(), padding, internal::scalar_cast_op()(0)); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp + pad(const PaddingDimensions& padding, const Scalar padding_value) const { + return TensorPaddingOp(derived(), padding, padding_value); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp + shuffle(const Shuffle& shfl) const { + return TensorShufflingOp(derived(), shfl); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp + stride(const Strides& strides) const { + return TensorStridingOp(derived(), strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorInflationOp + inflate(const Strides& strides) const { + return TensorInflationOp(derived(), strides); + } + + // Returns a tensor containing index/value pairs + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorIndexPairOp + index_pairs() const { + return TensorIndexPairOp(derived()); + } + + // Support for custom unary and binary operations + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomUnaryOp customOp(const CustomUnaryFunc& op) const { + return TensorCustomUnaryOp(derived(), op); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomBinaryOp customOp(const OtherDerived& other, const CustomBinaryFunc& op) const { + return TensorCustomBinaryOp(derived(), other, op); + } + + // Force the evaluation of the expression. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorForcedEvalOp eval() const { + return TensorForcedEvalOp(derived()); + } + + // Returns a formatted tensor ready for printing to a stream + template + inline const TensorWithFormat format(const Format& fmt) const { + return TensorWithFormat(derived(), fmt); + } + + #ifdef EIGEN_READONLY_TENSORBASE_PLUGIN + #include EIGEN_READONLY_TENSORBASE_PLUGIN + #endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } + + protected: + template friend class Tensor; + template friend class TensorFixedSize; + // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0 + template friend class Eigen::TensorBase; +}; + +template::value> +class TensorBase : public TensorBase { + public: + typedef TensorBase Base; + typedef internal::traits DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef Scalar CoeffReturnType; + static constexpr int NumDimensions = DerivedTraits::NumDimensions; + + template friend class Tensor; + template friend class TensorFixedSize; + // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0 + template friend class Eigen::TensorBase; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setZero() { + return setConstant(Scalar(0)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { + return derived() = this->constant(val); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->random(); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->template random(); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setValues( + const typename internal::Initializer::InitList& vals) { + TensorEvaluator eval(derived(), DefaultDevice()); + internal::initialize_tensor(eval, vals); + return derived(); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const OtherDerived& other) { + return derived() = derived() + other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const OtherDerived& other) { + return derived() = derived() - other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator*=(const OtherDerived& other) { + return derived() = derived() * other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator/=(const OtherDerived& other) { + return derived() = derived() / other.derived(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp + swap_layout() const { + return TensorLayoutSwapOp(derived()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorLayoutSwapOp + swap_layout() { + return TensorLayoutSwapOp(derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp + concatenate(const OtherDerived& other, const Axis& axis) const { + return TensorConcatenationOp(derived(), other, axis); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorConcatenationOp + concatenate(const OtherDerived& other, const Axis& axis) { + return TensorConcatenationOp(derived(), other, axis); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp(derived(), newDimensions); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReshapingOp + reshape(const NewDimensions& newDimensions) { + return TensorReshapingOp(derived(), newDimensions); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) { + return TensorSlicingOp(derived(), startIndices, sizes); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset) const { + EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range) + return TensorChippingOp(derived(), offset, DimId); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset) { + EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range) + return TensorChippingOp(derived(), offset, DimId); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset, const Index dim) { + return TensorChippingOp(derived(), offset, dim); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReverseOp + reverse(const ReverseDimensions& rev) { + return TensorReverseOp(derived(), rev); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorRollOp + roll(const Rolls& roll) const { + return TensorRollOp(derived(), roll); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorRollOp + roll(const Rolls& roll) { + return TensorRollOp(derived(), roll); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp + shuffle(const Shuffle& shfl) const { + return TensorShufflingOp(derived(), shfl); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorShufflingOp + shuffle(const Shuffle& shfl) { + return TensorShufflingOp(derived(), shfl); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp + stride(const Strides& strides) const { + return TensorStridingOp(derived(), strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingOp + stride(const Strides& strides) { + return TensorStridingOp(derived(), strides); + } + + // Select the device on which to evaluate the expression. + template + TensorDevice device(const DeviceType& dev) { + return TensorDevice(dev, derived()); + } + + // Select the async device on which to evaluate the expression. + template + TensorAsyncDevice device(const DeviceType& dev, DoneCallback done) { + return TensorAsyncDevice(dev, derived(), std::move(done)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } + + #ifdef EIGEN_TENSORBASE_PLUGIN + #include EIGEN_TENSORBASE_PLUGIN + #endif + + protected: + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TensorBase) + EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorBase) + + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(derived(), other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + return derived(); + } +}; +#endif // EIGEN_PARSED_BY_DOXYGEN +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h new file mode 100644 index 00000000000..0b068a7c99e --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -0,0 +1,1474 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H +#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +// -------------------------------------------------------------------------- // +// Forward declarations for templates defined below. +template +class TensorBlockIO; + +// -------------------------------------------------------------------------- // +// Helper function to compute strides for densely stored buffer of given +// dimensions. + +// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use +// this function instead everywhere. +template +EIGEN_ALWAYS_INLINE DSizes strides(const DSizes& dimensions) { + DSizes strides; + if (NumDims == 0) return strides; + + // TODO(ezhulenev): Use templates to unroll this loop (similar to + // h_array_reduce in CXX11meta.h)? Benchmark it. + if (static_cast(Layout) == static_cast(ColMajor)) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i - 1] * dimensions[i - 1]; + } + } else { + strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dimensions[i + 1]; + } + } + + return strides; +} + +template +EIGEN_ALWAYS_INLINE DSizes strides(const Eigen::array& dimensions) { + return strides(DSizes(dimensions)); +} + +template +EIGEN_STRONG_INLINE DSizes strides(const Sizes& sizes) { + return strides(DSizes(sizes)); +} + +// -------------------------------------------------------------------------- // + +// Tensor block shape type defines what are the shape preference for the blocks +// extracted from the larger tensor. +// +// Example: blocks of 100 elements from the large 100x100 tensor: +// - tensor: 100x100 +// - target_block_size: 100 +// +// TensorBlockShapeType: +// - kUniformAllDims: 100 blocks of size 10x10 +// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column +// or row major layout) +enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims }; + +struct TensorBlockResourceRequirements { + TensorBlockShapeType shape_type; // target block shape + size_t size; // target block size + TensorOpCost cost_per_coeff; // cost of computing a single block element + +#ifdef EIGEN_HIPCC + // For HIPCC, we need to explicitly declare as a "device fun", the constructor + // which is implicitly invoked in the "merge" / "any" routines. else HIPCC + // errors out complaining about the lack of a matching constructor + EIGEN_DEVICE_FUNC TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_, TensorOpCost cost_) + : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) {} +#endif + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type, + size_t size_in_bytes, TensorOpCost cost) { + const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar)); + return {shape_type, size, cost}; + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type, + size_t size_in_bytes) { + // This default cost per coefficient is valid for most materialized tensor + // block evaluation implementations, because they typically just read + // coefficients from the underlying tensor storage, and write to the tensor + // block buffer (scratch or destination memory, reads and writes have linear + // access pattern). We ignore the fixed cost of block evaluation, because in + // practice it should negligible. + // + // Lazy block evaluation adds the cost of calling a functor for each + // coefficient. + // + // All non-trivial block evaluation implementations must provide their own + // cost approximation (e.g. shuffling inner dimension has a much higher cost + // because it reads memory randomly, although the total number of moved + // bytes is the same). + return withShapeAndSize(shape_type, size_in_bytes, + {/*bytes_loaded=*/sizeof(Scalar), + /*bytes_stored=*/sizeof(Scalar), + /*compute_cycles=*/0}); + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(size_t size_in_bytes) { + return withShapeAndSize(TensorBlockShapeType::kSkewedInnerDims, size_in_bytes); + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(size_t size_in_bytes) { + return withShapeAndSize(TensorBlockShapeType::kUniformAllDims, size_in_bytes); + } + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements + merge(const TensorBlockResourceRequirements& lhs, const TensorBlockResourceRequirements& rhs) { + return {merge(lhs.shape_type, rhs.shape_type), // shape_type + merge(lhs.size, rhs.size), // size + merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff + } + + EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(TensorOpCost cost) { + cost_per_coeff += cost; + return *this; + } + + // This is a resource requirement that should be returned from expressions + // that do not have any block evaluation preference (e.g. default tensor + // expression with raw buffer access). + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() { + return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}}; + } + + private: + using Requirements = TensorBlockResourceRequirements; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) { + return numext::maxi(lhs_size, rhs_size); + } + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockShapeType merge(TensorBlockShapeType lhs, + TensorBlockShapeType rhs) { + return (lhs == TensorBlockShapeType::kSkewedInnerDims || rhs == TensorBlockShapeType::kSkewedInnerDims) + ? TensorBlockShapeType::kSkewedInnerDims + : TensorBlockShapeType::kUniformAllDims; + } + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, TensorOpCost rhs_cost) { + return lhs_cost + rhs_cost; + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockDescriptor specifies a block offset within a tensor and the block +// sizes along each of the tensor dimensions. + +template +class TensorBlockDescriptor { + public: + typedef DSizes Dimensions; + + // If we evaluate a Tensor assignment, and expression on the left, already has + // a memory buffer, then we might do performance optimization, and evaluate + // the root expression directly into the final output memory. Some time it's + // possible to reuse it for materializing subexpressions inside an expression + // tree, to to avoid dynamic memory allocation. + // + // The pointer type of the underlying storage is erased, because passing + // Scalar type through all the expression evaluation layers is way too many + // templates. In practice destination buffer type should always match the + // evaluated expression scalar type. + class DestinationBuffer { + public: + enum DestinationBufferKind : int { + // The above explicit specification of "int" as the enum basetype is + // needed to get around a HIPCC link error ("the field type is not + // amp-compatible") + // which is issued for class members with the enum type. + // TODO(rocm): + // remove the "int" basetype once HIPCC has been fixed to not error out + // in the above scenario. + + // Destination buffer is not defined (`m_data` == nullptr). + kEmpty, + + // Tensor block defined by an owning tensor block descriptor can fit + // contiguously into the destination buffer. In this case it's safe to + // materialize tensor block in the destination buffer, wrap it in a + // TensorMap, and use to build Eigen expression on top of it. + kContiguous, + + // Destination buffer strides do not match strides of the contiguously + // stored block, and it's impossible to define a TensorMap over this + // buffer. However if we are evaluating a root of an expression tree, we + // still can materialize an output into this destination, because we can + // guarantee that no one will ever access it through block API. + // + // In theory it is possible to build valid TensorStriding + // expression on top of this destination buffer, however it has + // inefficient coeff/packet access, and defeats the purpose of fast block + // evaluation API. + kStrided + }; + + template + Scalar* data() const { + eigen_assert(m_data_type_size == sizeof(Scalar)); + return static_cast(m_data); + } + + const Dimensions& strides() const { return m_strides; } + const DestinationBufferKind& kind() const { return m_kind; } + + private: + friend class TensorBlockDescriptor; + + DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {} + + template + DestinationBuffer(Scalar* data, const Dimensions& strides, DestinationBufferKind kind) + : m_data(static_cast(data)), m_data_type_size(sizeof(Scalar)), m_strides(strides), m_kind(kind) {} + + template + static DestinationBuffer make(const TensorBlockDescriptor& desc, Scalar* data, const Dimensions& strides) { + return DestinationBuffer(data, strides, kind(desc, strides)); + } + + template + static DestinationBufferKind kind(const TensorBlockDescriptor& desc, const Dimensions& strides) { + const Dimensions& desc_dims = desc.dimensions(); + const Dimensions& desc_strides = internal::strides(desc_dims); + for (int i = 0; i < NumDims; ++i) { + if (desc_dims[i] == 1) continue; + if (desc_strides[i] != strides[i]) return kStrided; + } + return kContiguous; + } + + // Storage pointer is type erased, to reduce template bloat, but we still + // keep the size of the underlying element type for error checking. + void* m_data; + size_t m_data_type_size; + + // Destination buffer dimensions always match the dimensions of a tensor + // block descriptor it belongs to, however strides might be different. + Dimensions m_strides; + + DestinationBufferKind m_kind; + }; + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, const DestinationBuffer& destination) + : m_offset(offset), m_dimensions(dimensions), m_destination(destination) {} + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions) + : m_offset(offset), m_dimensions(dimensions), m_destination(DestinationBuffer()) {} + + IndexType offset() const { return m_offset; } + const Dimensions& dimensions() const { return m_dimensions; } + IndexType dimension(int index) const { return m_dimensions[index]; } + IndexType size() const { return array_prod(m_dimensions); } + + const DestinationBuffer& destination() const { return m_destination; } + + template + void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) { + eigen_assert(dst_base != NULL); + m_destination = DestinationBuffer::template make(*this, dst_base, dst_strides); + } + + template + void AddDestinationBuffer(Scalar* dst_base, const DSizes& dst_strides) { + // DSizes constructor will do index type promotion if it's safe. + AddDestinationBuffer(dst_base, Dimensions(dst_strides)); + } + + TensorBlockDescriptor& DropDestinationBuffer() { + m_destination.m_data = NULL; + m_destination.m_kind = DestinationBuffer::kEmpty; + return *this; + } + + bool HasDestinationBuffer() const { return m_destination.kind() != DestinationBuffer::kEmpty; } + + // Returns a copy of `*this` with updated offset. + TensorBlockDescriptor WithOffset(IndexType offset) const { + return TensorBlockDescriptor(offset, m_dimensions, m_destination); + } + + private: + // Offset and dimensions are immutable after construction. Block descriptor + // can only be mutated by adding or dropping destination. + const IndexType m_offset; + const Dimensions m_dimensions; + DestinationBuffer m_destination; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockMapper is responsible for iterating over the blocks of a tensor. + +template +class TensorBlockMapper { + typedef TensorBlockDescriptor BlockDescriptor; + + public: + typedef DSizes Dimensions; + + TensorBlockMapper() = default; + TensorBlockMapper(const DSizes& dimensions, const TensorBlockResourceRequirements& requirements) + : m_tensor_dimensions(dimensions), m_requirements(requirements) { + // Compute block dimensions and the total number of blocks. + InitializeBlockDimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { return m_total_block_count; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { return m_block_dimensions.TotalSize(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& blockDimensions() const { + return m_block_dimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor blockDescriptor(IndexType block_index) const { + static const bool isColMajor = Layout == static_cast(ColMajor); + + IndexType offset = 0; + DSizes dimensions; + + if (NumDims == 0) return BlockDescriptor(offset, dimensions); + + // Iterate outer -> inner dimensions. + for (int i = NumDims - 1; i >= 0; --i) { + const int dim = isColMajor ? i : NumDims - i - 1; + + const IndexType idx = block_index / m_block_strides[dim]; + block_index -= idx * m_block_strides[dim]; + + const IndexType coord = idx * m_block_dimensions[dim]; + dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, m_block_dimensions[dim]); + offset += coord * m_tensor_strides[dim]; + } + + return {offset, dimensions}; + } + + private: + void InitializeBlockDimensions() { + // Requested block shape and size. + const TensorBlockShapeType shape_type = m_requirements.shape_type; + IndexType target_block_size = numext::maxi(1, static_cast(m_requirements.size)); + + IndexType tensor_size = m_tensor_dimensions.TotalSize(); + + // Corner case: one of the dimensions is zero. Logic below is too complex + // to handle this case on a general basis, just use unit block size. + // Note: we must not yield blocks with zero dimensions (recipe for + // overflows/underflows, divisions by zero and NaNs later). + if (tensor_size == 0) { + for (int i = 0; i < NumDims; ++i) { + m_block_dimensions[i] = 1; + } + m_total_block_count = 0; + return; + } + + // If tensor fits into a target block size, evaluate it as a single block. + if (tensor_size <= target_block_size) { + m_block_dimensions = m_tensor_dimensions; + m_total_block_count = 1; + // The only valid block index is `0`, and in this case we do not need + // to compute real strides for tensor or blocks (see blockDescriptor). + for (int i = 0; i < NumDims; ++i) { + m_tensor_strides[i] = 0; + m_block_strides[i] = 1; + } + return; + } + + static const bool isColMajor = Layout == static_cast(ColMajor); + + // Block shape skewed towards inner dimension. + if (shape_type == TensorBlockShapeType::kSkewedInnerDims) { + IndexType coeff_to_allocate = target_block_size; + + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - i - 1; + m_block_dimensions[dim] = numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]); + coeff_to_allocate = + numext::div_ceil(coeff_to_allocate, numext::maxi(static_cast(1), m_block_dimensions[dim])); + } + eigen_assert(coeff_to_allocate == 1); + + } else if (shape_type == TensorBlockShapeType::kUniformAllDims) { + // Tensor will not fit within 'target_block_size' budget: calculate tensor + // block dimension sizes based on "square" dimension size target. + const IndexType dim_size_target = convert_index( + std::pow(static_cast(target_block_size), 1.0f / static_cast(m_block_dimensions.rank()))); + + for (int i = 0; i < NumDims; ++i) { + // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it + // a multiple of the packet size. Note that reducing + // 'block_dim_size' in this manner can increase the number of + // blocks, and so will amplify any per-block overhead. + m_block_dimensions[i] = numext::mini(dim_size_target, m_tensor_dimensions[i]); + } + + // Add any un-allocated coefficients to inner dimension(s). + IndexType total_size = m_block_dimensions.TotalSize(); + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - i - 1; + + if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) { + const IndexType total_size_other_dims = total_size / m_block_dimensions[dim]; + const IndexType alloc_avail = numext::div_ceil(target_block_size, total_size_other_dims); + if (alloc_avail == m_block_dimensions[dim]) { + // Insufficient excess coefficients to allocate. + break; + } + m_block_dimensions[dim] = numext::mini(m_tensor_dimensions[dim], alloc_avail); + total_size = total_size_other_dims * m_block_dimensions[dim]; + } + } + + } else { + eigen_assert(false); // unknown block shape + } + + eigen_assert(m_block_dimensions.TotalSize() >= + numext::mini(target_block_size, m_tensor_dimensions.TotalSize())); + + // Calculate block counts by dimension and total block count. + DSizes block_count; + for (int i = 0; i < NumDims; ++i) { + block_count[i] = numext::div_ceil(m_tensor_dimensions[i], m_block_dimensions[i]); + } + m_total_block_count = array_prod(block_count); + + // Calculate block strides (used for enumerating blocks). + m_tensor_strides = strides(m_tensor_dimensions); + m_block_strides = strides(block_count); + } + + DSizes m_tensor_dimensions; + TensorBlockResourceRequirements m_requirements; + + DSizes m_block_dimensions; + IndexType m_total_block_count; + + DSizes m_tensor_strides; + DSizes m_block_strides; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockScratchAllocator is responsible for allocating temporary buffers +// for block evaluation (output or input block materialization). Given that +// Eigen expression traversal order is deterministic, all temporary allocations +// are happening in the same order, and usually have exactly the same size. +// Scratch allocator keeps a trace of all dynamic allocations, and after the +// first block evaluation is completed, we should be able to reuse all the +// temporary buffers for the next block evaluation. + +template +class TensorBlockScratchAllocator { + public: + explicit TensorBlockScratchAllocator(const Device& device) : m_device(device), m_allocation_index(0) {} + + ~TensorBlockScratchAllocator() { + for (size_t i = 0; i < m_allocations.size(); ++i) { + m_device.deallocate(m_allocations[i].ptr); + } + } + + void* allocate(size_t size) { + // TODO(ezhulenev): Remove when replaced with inlined vector. + if (m_allocations.capacity() == 0) m_allocations.reserve(8); + + // Check if we already have an existing allocation att current index. + const int num_allocations = static_cast(m_allocations.size()); + const bool has_allocation = m_allocation_index < num_allocations; + + // Allocation index can't be larger than the number of allocations. + eigen_assert(m_allocation_index <= num_allocations); + + // If we have existing allocation, and its size is larger or equal to + // requested size, we do nothing. + + // If current allocation can't fit requested size, we deallocate it, and + // replace with a larger allocation. + if (has_allocation && m_allocations[m_allocation_index].size < size) { + m_device.deallocate(m_allocations[m_allocation_index].ptr); + m_allocations[m_allocation_index].ptr = m_device.allocate(size); + m_allocations[m_allocation_index].size = size; + } + + // Make a new allocation if we don't have and existing one. + if (!has_allocation) { + Allocation allocation; + allocation.ptr = m_device.allocate(size); + allocation.size = size; + m_allocations.push_back(allocation); + } + + eigen_assert(m_allocations[m_allocation_index].ptr != NULL); + eigen_assert(m_allocations[m_allocation_index].size >= size); + + return m_allocations[m_allocation_index++].ptr; + } + + void reset() { m_allocation_index = 0; } + + private: + struct Allocation { + void* ptr; + size_t size; + }; + + const Device& m_device; + int m_allocation_index; + // TODO(ezhulenev): This should be an inlined vector. + std::vector m_allocations; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockKind represents all possible block kinds, that can be produced by +// TensorEvaluator::evalBlock function. +enum TensorBlockKind { + // Tensor block that is a lazy expression that must be assigned to a + // destination using TensorBlockAssign. + kExpr, + + // Tensor block that is a view into a memory buffer owned by an underlying + // Tensor expression (e.g. it can be a view into a Tensor buffer). + kView, + + // Tensor block that was materialized in a scratch memory buffer, allocated + // with TensorBlockScratchAllocator. This block must be copied to a + // destination, similar to a block of `kExpr` type. + kMaterializedInScratch, + + // Tensor block that was materialized directly into the final output memory + // buffer. For example if the left side of an assignment is a Tensor, we can + // directly materialize the block in the destination memory. + // + // If strides in the output buffer do not match tensor block strides, the + // Tensor expression will be invalid, and should not be used by + // TensorBlockAssign or for constructing another block expression. + kMaterializedInOutput +}; + +// -------------------------------------------------------------------------- // +// TensorBlockNotImplemented should be used to defined TensorBlock typedef in +// TensorEvaluators that do not support block evaluation. + +class TensorBlockNotImplemented { + public: + typedef void XprType; +}; + +// -------------------------------------------------------------------------- // +// XprScalar extracts Scalar type from the Eigen expressions (if expression type +// is not void). It's required to be able to define lazy block expression for +// argument types, that do not support block evaluation. + +template +struct XprScalar { + typedef typename XprType::Scalar type; +}; +template <> +struct XprScalar { + typedef void type; +}; + +// -------------------------------------------------------------------------- // +// TensorMaterializedBlock is a fully evaluated block of the original tensor, +// and XprType is just a TensorMap over the data. This block type is typically +// used to materialize blocks of tensor expressions, that can't be efficiently +// represented as lazy Tensor expressions with fast coeff/packet operations, +// e.g. we materialize all broadcasts into evaluated blocks. +// +// TensorMaterializedBlock does not own its memory buffer, it's either a memory +// buffer that backs the original expression (e.g. block is just a view into a +// Tensor), or a memory buffer allocated with scratch allocator, and in this +// case the scratch allocator will deallocate it at the end of block based +// expression execution. +// +// If the block was evaluated directly into the output buffer, and strides in +// the output buffer do not match block strides, the TensorMap expression will +// be invalid, and should never be used in block assignment or any other tensor +// expression. + +template +class TensorMaterializedBlock { + public: + typedef DSizes Dimensions; + typedef TensorMap > XprType; + + TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, const Dimensions& dimensions, + bool valid_expr = true) + : m_kind(kind), m_data(data), m_dimensions(dimensions), m_expr(m_data, m_dimensions), m_valid_expr(valid_expr) { + eigen_assert(m_kind == internal::TensorBlockKind::kView || + m_kind == internal::TensorBlockKind::kMaterializedInScratch || + m_kind == internal::TensorBlockKind::kMaterializedInOutput); + } + + TensorBlockKind kind() const { return m_kind; } + // NOTE(ezhulenev): Returning XprType by value like in other block types + // causes asan failures. The theory is that XprType::Nested doesn't work + // properly for TensorMap. + const XprType& expr() const { + eigen_assert(m_valid_expr); + return m_expr; + } + const Scalar* data() const { return m_data; } + void cleanup() {} + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + + // TensorMaterializedBlock can be backed by different types of storage: + // + // (1) Contiguous block of memory allocated with scratch allocator. + // (2) Contiguous block of memory reused from tensor block descriptor + // destination buffer. + // (3) Strided block of memory reused from tensor block descriptor + // destination buffer. + // + class Storage { + public: + Scalar* data() const { return m_data; } + const Dimensions& dimensions() const { return m_dimensions; } + const Dimensions& strides() const { return m_strides; } + + TensorMaterializedBlock AsTensorMaterializedBlock() const { + return TensorMaterializedBlock(m_materialized_in_output ? internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + m_data, m_dimensions, !m_strided_storage); + } + + private: + friend class TensorMaterializedBlock; + + Storage(Scalar* data, const Dimensions& dimensions, const Dimensions& strides, bool materialized_in_output, + bool strided_storage) + : m_data(data), + m_dimensions(dimensions), + m_strides(strides), + m_materialized_in_output(materialized_in_output), + m_strided_storage(strided_storage) {} + + Scalar* m_data; + Dimensions m_dimensions; + Dimensions m_strides; + bool m_materialized_in_output; + bool m_strided_storage; + }; + + // Creates a storage for materialized block either from the block descriptor + // destination buffer, or allocates a new buffer with scratch allocator. + template + EIGEN_STRONG_INLINE static Storage prepareStorage(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool allow_strided_storage = false) { + // Try to reuse destination as an output block buffer. + typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer; + + if (desc.destination().kind() == DestinationBuffer::kContiguous) { + Scalar* buffer = desc.destination().template data(); + desc.DropDestinationBuffer(); + return Storage(buffer, desc.dimensions(), internal::strides(desc.dimensions()), + /*materialized_in_output=*/true, + /*strided_storage=*/false); + + } else if (desc.destination().kind() == DestinationBuffer::kStrided && allow_strided_storage) { + Scalar* buffer = desc.destination().template data(); + desc.DropDestinationBuffer(); + return Storage(buffer, desc.dimensions(), desc.destination().strides(), + /*materialized_in_output=*/true, /*strided_storage=*/true); + + } else { + void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); + return Storage(static_cast(mem), desc.dimensions(), internal::strides(desc.dimensions()), + /*materialized_in_output=*/false, + /*strided_storage=*/false); + } + } + + // Creates a materialized block for the given descriptor from a memory buffer. + template + EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(const Scalar* data, const DataDimensions& data_dims, + TensorBlockDesc& desc, TensorBlockScratch& scratch) { + eigen_assert(array_size::value == desc.dimensions().size()); + + // If a tensor block dimensions covers a contiguous block of the underlying + // memory, we can skip block buffer memory allocation, and construct a block + // from existing `data` memory buffer. + // + // Example: (RowMajor layout) + // data_dims: [11, 12, 13, 14] + // desc.dimensions(): [1, 1, 3, 14] + // + // In this case we can construct a TensorBlock starting at + // `data + desc.offset()`, with a `desc.dimensions()` block sizes. + static const bool is_col_major = Layout == ColMajor; + + // Find out how many inner dimensions have a matching size. + int num_matching_inner_dims = 0; + for (int i = 0; i < NumDims; ++i) { + int dim = is_col_major ? i : NumDims - i - 1; + if (data_dims[dim] != desc.dimensions()[dim]) break; + ++num_matching_inner_dims; + } + + // All the outer dimensions must be of size `1`, except a single dimension + // before the matching inner dimension (`3` in the example above). + bool can_use_direct_access = true; + for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) { + int dim = is_col_major ? i : NumDims - i - 1; + if (desc.dimension(dim) != 1) { + can_use_direct_access = false; + break; + } + } + + if (can_use_direct_access) { + const Scalar* block_start = data + desc.offset(); + return TensorMaterializedBlock(internal::TensorBlockKind::kView, block_start, desc.dimensions()); + + } else { + // Reuse destination buffer or allocate new buffer with scratch allocator. + const Storage storage = prepareStorage(desc, scratch); + + typedef internal::TensorBlockIO TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + TensorBlockIOSrc src(internal::strides(Dimensions(data_dims)), data, desc.offset()); + TensorBlockIODst dst(storage.dimensions(), storage.strides(), storage.data()); + + TensorBlockIO::Copy(dst, src); + return storage.AsTensorMaterializedBlock(); + } + } + + private: + TensorBlockKind m_kind; + const Scalar* m_data; + Dimensions m_dimensions; + XprType m_expr; + bool m_valid_expr; +}; + +// -------------------------------------------------------------------------- // +// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp +// functor to the blocks produced by the underlying Tensor expression. + +template +class TensorCwiseUnaryBlock { + static constexpr bool NoArgBlockAccess = internal::is_void::value; + + public: + typedef std::conditional_t > + XprType; + + typedef typename XprScalar::type Scalar; + + TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor) + : m_arg_block(arg_block), m_functor(functor) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + + XprType expr() const { return XprType(m_arg_block.expr(), m_functor); } + const Scalar* data() const { return NULL; } + void cleanup() { m_arg_block.cleanup(); } + + private: + ArgTensorBlock m_arg_block; + UnaryOp m_functor; +}; + +// -------------------------------------------------------------------------- // +// TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp +// functor to the blocks produced by the underlying Tensor expression. + +template +class TensorCwiseBinaryBlock { + static constexpr bool NoArgBlockAccess = internal::is_void::value || + internal::is_void::value; + + public: + typedef std::conditional_t< + NoArgBlockAccess, void, + TensorCwiseBinaryOp > + XprType; + + typedef typename XprScalar::type Scalar; + + TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, const RhsTensorBlock& right_block, const BinaryOp& functor) + : m_left_block(left_block), m_right_block(right_block), m_functor(functor) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + + XprType expr() const { return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); } + + const Scalar* data() const { return NULL; } + + void cleanup() { + m_left_block.cleanup(); + m_right_block.cleanup(); + } + + private: + LhsTensorBlock m_left_block; + RhsTensorBlock m_right_block; + BinaryOp m_functor; +}; + +// -------------------------------------------------------------------------- // +// TensorUnaryExprBlock is a lazy tensor expression block that can construct +// an arbitrary tensor expression from a block of the underlying type (this is a +// generalization of the TensorCwiseUnaryBlock for arbitrary expressions). + +template +class TensorUnaryExprBlock { + typedef typename ArgTensorBlock::XprType ArgXprType; + static constexpr bool NoArgBlockAccess = internal::is_void::value; + + public: + typedef std::conditional_t::type> XprType; + + typedef typename XprScalar::type Scalar; + + TensorUnaryExprBlock(const ArgTensorBlock& arg_block, const BlockFactory& factory) + : m_arg_block(arg_block), m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { return m_factory.expr(m_arg_block.expr()); } + const Scalar* data() const { return NULL; } + void cleanup() { m_arg_block.cleanup(); } + + private: + ArgTensorBlock m_arg_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// TensorTernaryExprBlock is a lazy tensor expression block that can construct +// an arbitrary tensor expression from three blocks of the underlying type. + +template +class TensorTernaryExprBlock { + typedef typename Arg1TensorBlock::XprType Arg1XprType; + typedef typename Arg2TensorBlock::XprType Arg2XprType; + typedef typename Arg3TensorBlock::XprType Arg3XprType; + + static constexpr bool NoArgBlockAccess = internal::is_void::value || + internal::is_void::value || + internal::is_void::value; + + public: + typedef std::conditional_t::type> + XprType; + + typedef typename XprScalar::type Scalar; + + TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, const Arg2TensorBlock& arg2_block, + const Arg3TensorBlock& arg3_block, const BlockFactory& factory) + : m_arg1_block(arg1_block), m_arg2_block(arg2_block), m_arg3_block(arg3_block), m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), m_arg3_block.expr()); } + const Scalar* data() const { return NULL; } + void cleanup() { + m_arg1_block.cleanup(); + m_arg2_block.cleanup(); + m_arg3_block.cleanup(); + } + + private: + Arg1TensorBlock m_arg1_block; + Arg2TensorBlock m_arg2_block; + Arg3TensorBlock m_arg3_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// StridedLinearBufferCopy provides a method to copy data between two linear +// buffers with different strides, with optimized paths for scatter/gather. + +template +class StridedLinearBufferCopy { + typedef typename packet_traits::type Packet; + typedef typename unpacket_traits::half HalfPacket; + enum { + Vectorizable = packet_traits::Vectorizable, + PacketSize = packet_traits::size, + HalfPacketSize = unpacket_traits::size, + HasHalfPacket = static_cast(HalfPacketSize) < static_cast(PacketSize) + }; + + public: + // Specifying linear copy kind statically gives ~30% speedup for small sizes. + enum class Kind { + Linear = 0, // src_stride == 1 && dst_stride == 1 + Scatter = 1, // src_stride == 1 && dst_stride != 1 + FillLinear = 2, // src_stride == 0 && dst_stride == 1 + FillScatter = 3, // src_stride == 0 && dst_stride != 1 + Gather = 4, // dst_stride == 1 + Random = 5 // everything else + }; + + struct Dst { + Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + Scalar* data; + }; + + struct Src { + Src(IndexType o, IndexType s, const Scalar* d) : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + const Scalar* data; + }; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, const Src& src, const size_t count) { + Run(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, src.data); + } + + private: + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const IndexType count, const IndexType dst_offset, + const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data, + const IndexType src_offset, const IndexType src_stride, + const Scalar* EIGEN_RESTRICT src_data) { + const Scalar* src = &src_data[src_offset]; + Scalar* dst = &dst_data[dst_offset]; + + if (!Vectorizable) { + for (Index i = 0; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + return; + } + + const IndexType vectorized_size = PacketSize * (count / PacketSize); + IndexType i = 0; + + if (kind == StridedLinearBufferCopy::Kind::Linear) { + // ******************************************************************** // + // Linear copy from `src` to `dst`. + const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize)); + eigen_assert(src_stride == 1 && dst_stride == 1); + for (; i < unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + Packet p = ploadu(src + i + j * PacketSize); + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i < vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pstoreu(dst + i, p); + } + if (HasHalfPacket) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { + HalfPacket p = ploadu(src + i); + pstoreu(dst + i, p); + i += HalfPacketSize; + } + } + for (; i < count; ++i) { + dst[i] = src[i]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Scatter) { + // Scatter from `src` to `dst`. + eigen_assert(src_stride == 1 && dst_stride != 1); + for (; i < vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pscatter(dst + i * dst_stride, p, dst_stride); + } + if (HasHalfPacket) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { + HalfPacket p = ploadu(src + i); + pscatter(dst + i * dst_stride, p, dst_stride); + i += HalfPacketSize; + } + } + for (; i < count; ++i) { + dst[i * dst_stride] = src[i]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) { + // Fill `dst` with value at `*src`. + eigen_assert(src_stride == 0 && dst_stride == 1); + + const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize)); + Scalar s = *src; + Packet p = pset1(s); + for (; i < unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i < vectorized_size; i += PacketSize) { + pstoreu(dst + i, p); + } + if (HasHalfPacket) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { + HalfPacket hp = pset1(s); + pstoreu(dst + i, hp); + i += HalfPacketSize; + } + } + for (; i < count; ++i) { + dst[i] = s; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) { + // Scatter `*src` into `dst`. + eigen_assert(src_stride == 0 && dst_stride != 1); + Scalar s = *src; + Packet p = pset1(s); + for (; i < vectorized_size; i += PacketSize) { + pscatter(dst + i * dst_stride, p, dst_stride); + } + if (HasHalfPacket) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { + HalfPacket hp = pset1(s); + pscatter(dst + i * dst_stride, hp, dst_stride); + i += HalfPacketSize; + } + } + for (; i < count; ++i) { + dst[i * dst_stride] = s; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Gather) { + // Gather from `src` into `dst`. + eigen_assert(dst_stride == 1); + for (; i < vectorized_size; i += PacketSize) { + Packet p = pgather(src + i * src_stride, src_stride); + pstoreu(dst + i, p); + } + if (HasHalfPacket) { + const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize); + if (i < vectorized_half_size) { + HalfPacket p = pgather(src + i * src_stride, src_stride); + pstoreu(dst + i, p); + i += HalfPacketSize; + } + } + for (; i < count; ++i) { + dst[i] = src[i * src_stride]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Random) { + // Random. + for (; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + } else { + eigen_assert(false); + } + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block. +// It's possible to specify src->dst dimension mapping for the copy operation. +// Dimensions of `dst` specify how many elements have to be copied, for the +// `src` we need to know only stride to navigate through source memory buffer. + +template +class TensorBlockIO { + static constexpr bool IsColMajor = (Layout == ColMajor); + + typedef StridedLinearBufferCopy LinCopy; + + public: + typedef DSizes Dimensions; + typedef DSizes DimensionsMap; + + struct Dst { + Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, IndexType dst_offset = 0) + : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} + + Dimensions dims; + Dimensions strides; + Scalar* data; + IndexType offset; + }; + + struct Src { + Src(const Dimensions& src_strides, const Scalar* src, IndexType src_offset = 0) + : strides(src_strides), data(src), offset(src_offset) {} + + Dimensions strides; + const Scalar* data; + IndexType offset; + }; + + // Copies data to `dst` from `src`, using provided dimensions mapping: + // + // src_dimension_index = dst_to_src_dim_map[dst_dimension_index] + // + // Returns the number of copied elements. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(const Dst& dst, const Src& src, + const DimensionsMap& dst_to_src_dim_map) { + // Copy single scalar value from `src` to `dst`. + if (NumDims == 0) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return 1; + } + + // Both `dst` and `src` must have contiguous innermost dimension. We also + // accept the special case with stride '0', because it's used as a trick to + // implement broadcasting. + { + int inner_dim = IsColMajor ? 0 : NumDims - 1; + EIGEN_UNUSED_VARIABLE(inner_dim); + eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0); + eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0); + } + + // Give a shorter name to `dst_to_src_dim_map`. + const DimensionsMap& dim_map = dst_to_src_dim_map; + + // Do not squeeze reordered inner dimensions. + int num_squeezable_dims = NumSqueezableInnerDims(dim_map); + + // NOTE: We find the innermost dimension (contiguous in memory) in the dst + // block, and we write data linearly into that dimension, reading it from + // the src. If dimensions are reordered, we might end up reading data from + // the src with `stride != 1`. + // + // NOTE: Random-Read/Linear-Write can be up to ~2X faster than + // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680 + + // Find the innermost dimension in the dst whose size is not 1. This is the + // effective inner dim. + int num_size_one_inner_dims = 0; + for (int i = 0; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? i : NumDims - i - 1; + if (dst.dims[dst_dim] != 1) break; + num_size_one_inner_dims++; + } + + // If all dimensions are of size 1, just copy a scalar from `src` to `dst`. + if (num_size_one_inner_dims == NumDims) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return 1; + } + + // Outermost dimension in the dst with `stride == 1` (contiguous in memory). + const int dst_stride1_dim = IsColMajor ? num_size_one_inner_dims : NumDims - num_size_one_inner_dims - 1; + + // Dimension in the src that corresponds to the dst innermost dimension. + const int src_dim_for_dst_stride1_dim = NumDims == 0 ? 1 : dim_map[dst_stride1_dim]; + + // Size of the innermost dimension (length of contiguous blocks of memory). + IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim]; + + // Squeeze multiple inner dims into one if they are contiguous in `dst` and + // `src` memory, so we can do less linear copy calls. + for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? i : NumDims - i - 1; + const IndexType dst_stride = dst.strides[dst_dim]; + const IndexType src_stride = src.strides[dim_map[dst_dim]]; + if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) { + dst_inner_dim_size *= dst.dims[dst_dim]; + ++num_size_one_inner_dims; + } else { + break; + } + } + + // Setup strides to read data from `src` and write to `dst`. + IndexType input_offset = src.offset; + IndexType output_offset = dst.offset; + IndexType input_stride = NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim]; + IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim]; + + const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array it; + + // Initialize block iterator state. Squeeze away any dimension of size 1. + int idx = 0; // currently initialized iterator state index + for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { + const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2; + if (dst.dims[dst_dim] == 1) continue; + + it[idx].size = dst.dims[dst_dim]; + it[idx].input_stride = src.strides[dim_map[dst_dim]]; + it[idx].output_stride = dst.strides[dst_dim]; + + it[idx].input_span = it[idx].input_stride * (it[idx].size - 1); + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + + idx++; + } + + // Iterate copying data from src to dst. + const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize(); + +#define COPY_INNER_DIM(KIND) \ + IndexType num_copied = 0; \ + for (num_copied = 0; num_copied < block_total_size; num_copied += dst_inner_dim_size) { \ + LinCopy::template Run(typename LinCopy::Dst(output_offset, output_stride, dst.data), \ + typename LinCopy::Src(input_offset, input_stride, src.data), dst_inner_dim_size); \ + \ + for (int j = 0; j < idx; ++j) { \ + if (++it[j].count < it[j].size) { \ + input_offset += it[j].input_stride; \ + output_offset += it[j].output_stride; \ + break; \ + } \ + it[j].count = 0; \ + input_offset -= it[j].input_span; \ + output_offset -= it[j].output_span; \ + } \ + } \ + return num_copied; + + if (input_stride == 1 && output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::Linear); + } else if (input_stride == 1 && output_stride != 1) { + COPY_INNER_DIM(LinCopy::Kind::Scatter); + } else if (input_stride == 0 && output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::FillLinear); + } else if (input_stride == 0 && output_stride != 1) { + COPY_INNER_DIM(LinCopy::Kind::FillScatter); + } else if (output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::Gather); + } else { + COPY_INNER_DIM(LinCopy::Kind::Random); + } + +#undef COPY_INNER_DIM + } + + // Copy from `src` to `dst` with an identity src->dst dimension map. Returns + // the number of copied elements. + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, const Src& src) { + DimensionsMap dst_to_src_map; + for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i; + return Copy(dst, src, dst_to_src_map); + } + + private: + struct BlockIteratorState { + BlockIteratorState() : size(0), count(0), input_stride(0), output_stride(0), input_span(0), output_span(0) {} + + IndexType size; + IndexType count; + IndexType input_stride; + IndexType output_stride; + IndexType input_span; + IndexType output_span; + }; + + // Compute how many inner dimensions it's allowed to squeeze when doing IO + // between two tensor blocks. It's safe to squeeze inner dimensions, only + // if they are not reordered. + static int NumSqueezableInnerDims(const DimensionsMap& dim_map) { + int num_squeezable_dims = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + if (dim_map[dim] != dim) break; + num_squeezable_dims++; + } + return num_squeezable_dims; + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to +// a Tensor block defined by `desc`, backed by a memory buffer at `target`. +// +// Currently there is no way to write from a Tensor expression to a block of +// memory, if dimensions are reordered. If you need to do that, you should +// materialize a Tensor block expression into a memory buffer, and then use +// TensorBlockIO to copy data between two memory buffers with a custom +// `target->src` dimension map (see definition above). +// +// Also currently the innermost dimension of `target` must have a stride '1' +// (contiguous in memory). This restriction could be lifted with a `pscatter`, +// but in practice it's never needed, and there is a similar TensorBlockIO +// workaround for that. +// +// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO +// where `src` is a tensor expression. Explore if it is possible to rewrite IO +// to use expressions instead of pointers, and after that TensorBlockAssignment +// will become an alias to IO. +template +class TensorBlockAssignment { + // We will use coeff/packet path to evaluate block expressions. + typedef TensorEvaluator TensorBlockEvaluator; + + typedef DSizes Dimensions; + + enum { Vectorizable = packet_traits::Vectorizable, PacketSize = packet_traits::size }; + + template + struct InnerDimAssign { + EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) { + for (IndexType i = 0; i < count; ++i) { + target[i] = eval.coeff(eval_offset + i); + } + } + }; + + template + struct InnerDimAssign { + EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) { + typedef typename packet_traits::type Packet; + + const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize)); + const IndexType vectorized_size = PacketSize * (count / PacketSize); + IndexType i = 0; + + for (; i < unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + const IndexType idx = eval_offset + i + j * PacketSize; + Packet p = eval.template packet(idx); + pstoreu(target + i + j * PacketSize, p); + } + } + + for (; i < vectorized_size; i += PacketSize) { + Packet p = eval.template packet(eval_offset + i); + pstoreu(target + i, p); + } + + for (; i < count; ++i) { + target[i] = eval.coeff(eval_offset + i); + } + } + }; + + public: + struct Target { + Target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data, + IndexType target_offset = 0) + : dims(target_dims), strides(target_strides), data(target_data), offset(target_offset) {} + + Dimensions dims; + Dimensions strides; + Scalar* data; + IndexType offset; + }; + + static Target target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data, + IndexType target_offset = 0) { + return Target(target_dims, target_strides, target_data, target_offset); + } + + template + static Target target(const DSizes& target_dims, + const DSizes& target_strides, Scalar* target_data, + IndexType target_offset = 0) { + // DSizes constructor will do index type promotion if it's safe. + return Target(Dimensions(target_dims), Dimensions(target_strides), target_data, target_offset); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Target& target, const TensorBlockExpr& expr) { + // Prepare evaluator for block expression. + DefaultDevice default_device; + TensorBlockEvaluator eval(expr, default_device); + + // Tensor block expression dimension should match destination dimensions. + eigen_assert(dimensions_match(target.dims, eval.dimensions())); + + static const int Layout = TensorBlockEvaluator::Layout; + static const bool is_col_major = Layout == ColMajor; + + // Initialize output inner dimension size based on a layout. + const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize(); + const int inner_dim_idx = is_col_major ? 0 : NumDims - 1; + IndexType output_inner_dim_size = target.dims[inner_dim_idx]; + + // Target inner dimension stride must be '1'. + eigen_assert(target.strides[inner_dim_idx] == 1); + + // Squeeze multiple inner dims into one if they are contiguous in `target`. + IndexType num_squeezed_dims = 0; + for (Index i = 1; i < NumDims; ++i) { + const Index dim = is_col_major ? i : NumDims - i - 1; + const IndexType target_stride = target.strides[dim]; + + if (output_inner_dim_size == target_stride) { + output_inner_dim_size *= target.dims[dim]; + num_squeezed_dims++; + } else { + break; + } + } + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + + int idx = 0; // currently initialized iterator state index + for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) { + const Index dim = is_col_major ? i + 1 : NumDims - i - 2; + + it[idx].count = 0; + it[idx].size = target.dims[dim]; + it[idx].output_stride = target.strides[dim]; + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + idx++; + } + + // We read block expression from the beginning, and start writing data to + // `target` at given offset. + IndexType input_offset = 0; + IndexType output_offset = target.offset; + + // Iterate copying data from `eval` to `target`. + for (IndexType i = 0; i < output_size; i += output_inner_dim_size) { + // Assign to `target` at current offset. + InnerDimAssign::Run( + target.data + output_offset, output_inner_dim_size, eval, input_offset); + + // Move input offset forward by the number of assigned coefficients. + input_offset += output_inner_dim_size; + + // Update index. + for (int j = 0; j < idx; ++j) { + if (++it[j].count < it[j].size) { + output_offset += it[j].output_stride; + break; + } + it[j].count = 0; + output_offset -= it[j].output_span; + } + } + } + + private: + struct BlockIteratorState { + BlockIteratorState() : count(0), size(0), output_stride(0), output_span(0) {} + + IndexType count; + IndexType size; + IndexType output_stride; + IndexType output_span; + }; +}; + +// -------------------------------------------------------------------------- // + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h new file mode 100644 index 00000000000..9ef4bbce092 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -0,0 +1,1001 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H +#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorBroadcasting + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor broadcasting class. + * + * + */ +namespace internal { +template +struct traits> : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorBroadcastingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, + typename eval>::type> { + typedef TensorBroadcastingOp type; +}; + +template +struct is_input_scalar { + static const bool value = false; +}; +template <> +struct is_input_scalar> { + static const bool value = true; +}; +template +struct is_input_scalar> { + static constexpr bool value = (Sizes::total_size == 1); +}; + +} // end namespace internal + +template +class TensorBroadcastingOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) + : m_xpr(expr), m_broadcast(broadcast) {} + + EIGEN_DEVICE_FUNC const Broadcast& broadcast() const { return m_broadcast; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Broadcast m_broadcast; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorBroadcastingOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions InputDimensions; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator won't be + // standard layout; + bool isCopy, nByOne, oneByN; + + public: + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + RawAccess = false + }; + static constexpr int Layout = TensorEvaluator::Layout; + + typedef std::remove_const_t ScalarNoConst; + + // We do block based broadcasting using a trick with 2x tensor rank and 0 + // strides. See block method implementation for details. + typedef DSizes BroadcastDimensions; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : isCopy(false), + nByOne(false), + oneByN(false), + m_device(device), + m_broadcast(op.broadcast()), + m_impl(op.expression(), device) { + // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar + // and store the result in a scalar. Instead one should reshape the scalar into a N-D + // tensor with N >= 1 of 1 element first and then broadcast. + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + const InputDimensions& input_dims = m_impl.dimensions(); + isCopy = true; + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i] * m_broadcast[i]; + if (m_broadcast[i] != 1) { + isCopy = false; + } + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_inputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + + if (input_dims[0] == 1) { + oneByN = true; + for (int i = 1; i < NumDims; ++i) { + if (m_broadcast[i] != 1) { + oneByN = false; + break; + } + } + } else if (input_dims[NumDims - 1] == 1) { + nByOne = true; + for (int i = 0; i < NumDims - 1; ++i) { + if (m_broadcast[i] != 1) { + nByOne = false; + break; + } + } + } + + // Handle special format like NCHW, its input shape is '[1, N..., 1]' and + // broadcast shape is '[N, 1..., N]' + if (!oneByN && !nByOne) { + if (input_dims[0] == 1 && input_dims[NumDims - 1] == 1 && NumDims > 2) { + nByOne = true; + oneByN = true; + for (int i = 1; i < NumDims - 1; ++i) { + if (m_broadcast[i] != 1) { + nByOne = false; + oneByN = false; + break; + } + } + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { + if (internal::is_input_scalar>::value) { + return m_impl.coeff(0); + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + if (isCopy) { + return m_impl.coeff(index); + } else { + return coeffColMajor(index); + } + } else { + if (isCopy) { + return m_impl.coeff(index); + } else { + return coeffRowMajor(index); + } + } + } + + // TODO: attempt to speed this up. The integer divisions and modulo are slow + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const { + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + inputIndex += index; + } else { + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[0]); + } + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const { + return m_impl.coeff(indexColMajor(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const { + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims - 1]); + inputIndex += index; + } else { + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[NumDims - 1]); + } + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const { + return m_impl.coeff(indexRowMajor(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + if (internal::is_input_scalar>::value) { + return internal::pset1(m_impl.coeff(0)); + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + if (isCopy) { +#ifdef EIGEN_GPU_COMPILE_PHASE + // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing + // unaligned loads here. The reason is unclear though. + return m_impl.template packet(index); +#else + return m_impl.template packet(index); +#endif + } else if (oneByN && !nByOne) { + return packetNByOne(index); + } else if (!oneByN && nByOne) { + return packetOneByN(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne(index); + } else { + return packetColMajor(index); + } + } else { + if (isCopy) { +#ifdef EIGEN_GPU_COMPILE_PHASE + // See above. + return m_impl.template packet(index); +#else + return m_impl.template packet(index); +#endif + } else if (oneByN && !nByOne) { + return packetOneByN(index); + } else if (!oneByN && nByOne) { + return packetNByOne(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne(index); + } else { + return packetRowMajor(index); + } + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + Index startDim, endDim; + Index inputIndex, outputOffset, batchedIndex; + + if (static_cast(Layout) == static_cast(ColMajor)) { + startDim = NumDims - 1; + endDim = 1; + } else { + startDim = 0; + endDim = NumDims - 2; + } + + batchedIndex = index % m_outputStrides[startDim]; + inputIndex = batchedIndex / m_outputStrides[endDim]; + outputOffset = batchedIndex % m_outputStrides[endDim]; + + if (outputOffset + PacketSize <= m_outputStrides[endDim]) { + values[0] = m_impl.coeff(inputIndex); + return internal::pload1(values); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { + if (outputOffset + cur < m_outputStrides[endDim]) { + values[i] = m_impl.coeff(inputIndex); + } else { + ++inputIndex; + inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex); + values[i] = m_impl.coeff(inputIndex); + outputOffset = 0; + cur = 0; + } + } + return internal::pload(values); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const { + // Consider the flattened tensor [v0, ..., vN], + // Concatenates m_broadcast[dim] copies, + // [v0, ..., vN, v0, ..., vN, ... ] + // with dim == NumDims - 1 for col-major, dim == 0 for row-major. + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + // Size of flattened tensor. + const Index M = + (static_cast(Layout) == static_cast(ColMajor)) ? m_inputStrides[NumDims - 1] : m_inputStrides[0]; + Index inputIndex = index % M; + if (inputIndex + PacketSize <= M) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + if (inputIndex > M - 1) { + inputIndex = 0; + } + values[i] = m_impl.coeff(inputIndex++); + } + return internal::pload(values); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const { + // Consider the flattened tensor [v0, ..., vN], + // Interleaves m_broadcast[dim] copies, + // [v0, v0, ..., v1, v1, ..., vN, vN, ... ] + // with dim == 0 for col-major, dim == NumDims - 1 for row-major. + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + const Index M = + (static_cast(Layout) == static_cast(ColMajor)) ? m_broadcast[0] : m_broadcast[NumDims - 1]; + + Index inputIndex = index / M; + Index outputOffset = index % M; + if (outputOffset + PacketSize <= M) { + return internal::pset1(m_impl.coeff(inputIndex)); + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + if (outputOffset < M) { + values[i] = m_impl.coeff(inputIndex); + ++outputOffset; + } else { + values[i] = m_impl.coeff(++inputIndex); + outputOffset = 1; // Next offset. + } + } + return internal::pload(values); + } + } + + // Ignore the LoadMode and always use unaligned loads since we can't guarantee + // the alignment at compile time. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + innermostLoc = index; + } else { + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[0]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. + if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + values[0] = m_impl.coeff(inputIndex); + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize; ++i) { + if (innermostLoc + i < m_impl.dimensions()[0]) { + values[i] = m_impl.coeff(inputIndex + i); + } else { + values[i] = coeffColMajor(originalIndex + i); + } + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims - 1]); + innermostLoc = index; + } else { + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[NumDims - 1]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. + if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims - 1]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + values[0] = m_impl.coeff(inputIndex); + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize; ++i) { + if (innermostLoc + i < m_impl.dimensions()[NumDims - 1]) { + values[i] = m_impl.coeff(inputIndex + i); + } else { + values[i] = coeffRowMajor(originalIndex + i); + } + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = TensorOpCost::AddCost(); + if (!isCopy && NumDims > 0) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + compute_cost += TensorOpCost::DivCost(); + if (internal::index_statically_eq(i, 1)) { + compute_cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } else { + if (!internal::index_statically_eq(i, 1)) { + compute_cost += + TensorOpCost::MulCost() + TensorOpCost::ModCost() + TensorOpCost::AddCost(); + } + } + compute_cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } + } + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large + // tensors. But this might need further tuning. + const size_t target_size = m_device.firstLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + m_impl.getResourceRequirements(), internal::TensorBlockResourceRequirements::skewed(target_size)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + BlockBroadcastingParams params = blockBroadcastingParams(desc); + + if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) { + return emptyBlock(); + } + + // Prepare storage for the materialized broadcasting result. + const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + ScalarNoConst* materialized_output = block_storage.data(); + + // We potentially will need to materialize input blocks. + size_t materialized_input_size = 0; + ScalarNoConst* materialized_input = NULL; + + // Initialize block broadcating iterator state for outer dimensions (outer + // with regard to bcast dimension). Dimension in this array are always in + // inner_most -> outer_most order (col major layout). + array it; + int idx = 0; + + for (int i = params.inner_dim_count + 1; i < NumDims; ++i) { + const Index dim = IsColMajor ? i : NumDims - 1 - i; + it[idx].size = params.output_dims[dim]; + it[idx].count = 0; + it[idx].output_stride = m_outputStrides[dim]; + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + idx++; + } + + // Write output into the beginning of `materialized_output`. + Index output_offset = 0; + + // We will fill output block by broadcasting along the bcast dim, and + // iterating over outer dimension. + const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize(); + + for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) { + ScalarNoConst* bcast_output = materialized_output + num_output_coeffs; + Index bcast_offset = desc.offset() + output_offset; + + // Broadcast along the bcast dimension. + num_output_coeffs += BroadcastBlockAlongBcastDim(params, bcast_offset, scratch, bcast_output, &materialized_input, + &materialized_input_size); + + // Switch to the next outer dimension. + for (int j = 0; j < idx; ++j) { + if (++it[j].count < it[j].size) { + output_offset += it[j].output_stride; + break; + } + it[j].count = 0; + output_offset -= it[j].output_span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + const TensorEvaluator& impl() const { return m_impl; } + + Broadcast functor() const { return m_broadcast; } + + private: + static constexpr bool IsColMajor = static_cast(Layout) == static_cast(ColMajor); + + // We will build a general case block broadcasting on top of broadcasting + // primitive that will do broadcasting only for the inner dimension(s) along + // the first dimension smaller than the input size (it's called `bcast_dim`). + // + // Example: + // dim: 0 1 2 (ColMajor) + // input size: [9, 3, 6] + // block size: [9, 2, 6] + // + // We will compute broadcasted block by iterating over the outer dimensions + // before `bcast_dim` (only dimension `2` in this example) and computing + // broadcasts along the `bcast_dim` (dimension `1` in this example). + + // BlockBroadcastingParams holds precomputed parameters for broadcasting a + // single block along the broadcasting dimension. Sizes and strides along the + // `bcast_dim` might be invalid, they will be adjusted later in + // `BroadcastBlockAlongBcastDim`. + struct BlockBroadcastingParams { + Dimensions input_dims; // input expression dimensions + Dimensions output_dims; // output block sizes + Dimensions output_strides; // output block strides + + int inner_dim_count; // count inner dimensions matching in size + int bcast_dim; // broadcasting dimension index + Index bcast_dim_size; // broadcasting dimension size + Index inner_dim_size; // inner dimensions size + + // Block sizes and strides for the input block where all dimensions before + // `bcast_dim` are equal to `1`. + Dimensions input_block_sizes; + Dimensions input_block_strides; + + // Block sizes and strides for blocks with extra dimensions and strides `0`. + BroadcastDimensions bcast_block_sizes; + BroadcastDimensions bcast_block_strides; + BroadcastDimensions bcast_input_strides; + }; + + struct BlockBroadcastingIteratorState { + Index size; + Index count; + Index output_stride; + Index output_span; + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams blockBroadcastingParams(TensorBlockDesc& desc) const { + BlockBroadcastingParams params; + + params.input_dims = Dimensions(m_impl.dimensions()); + + // Output block sizes and strides. + params.output_dims = desc.dimensions(); + params.output_strides = internal::strides(params.output_dims); + + // Find the broadcasting dimension (first dimension with output size smaller + // that the input size). + params.bcast_dim = 0; + params.bcast_dim_size = 1; + params.inner_dim_size = 1; + + // Count the number of inner dimensions that have the same size in the block + // and in the broadcast expression. + params.inner_dim_count = 0; + + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + + if (params.output_dims[dim] == m_dimensions[dim]) { + params.inner_dim_size *= params.output_dims[dim]; + ++params.inner_dim_count; + continue; + } + + // First non-matching dimension is the broadcasting dimension. + eigen_assert(params.output_dims[dim] < m_dimensions[dim]); + params.bcast_dim = dim; + params.bcast_dim_size = params.output_dims[dim]; + break; + } + + // Calculate the input block size for looking into the input. + for (int i = 0; i < params.inner_dim_count; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + params.input_block_sizes[dim] = params.input_dims[dim]; + } + for (int i = params.inner_dim_count; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + params.input_block_sizes[dim] = 1; + } + params.input_block_strides = internal::strides(params.input_block_sizes); + + // Broadcast with the 0-stride trick: Create 1 extra dim for each + // broadcast, set the input stride to 0. + // + // When ColMajor: + // + // - bcast_block_sizes: + // [d_0, b_0, d_1, b_1, ...] + // + // - bcast_block_strides: + // [output_block_strides[0], output_block_strides[0] * d_0, + // output_block_strides[1], output_block_strides[1] * d_1, + // ...] + // + // - bcast_input_strides: + // [input_block_strides[0], 0, + // input_block_strides[1], 0, + // ...]. + // + for (int i = 0; i < params.inner_dim_count; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + + const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1; + const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1; + + params.bcast_block_sizes[copy_dim] = params.input_dims[dim]; + params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim]; + params.bcast_block_strides[copy_dim] = params.output_strides[dim]; + params.bcast_block_strides[broadcast_dim] = params.output_strides[dim] * params.input_dims[dim]; + params.bcast_input_strides[copy_dim] = params.input_block_strides[dim]; + params.bcast_input_strides[broadcast_dim] = 0; + } + + for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) { + const int dim = IsColMajor ? i : 2 * NumDims - i - 1; + params.bcast_block_sizes[dim] = 1; + params.bcast_block_strides[dim] = 0; + params.bcast_input_strides[dim] = 0; + } + + return params; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const { + DSizes dimensions; + for (int i = 0; i < NumDims; ++i) dimensions[i] = 0; + return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim( + BlockBroadcastingParams params, Index bcast_offset, TensorBlockScratch& scratch, + ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, size_t* materialized_input_size) const { + if (params.bcast_dim_size == 1) { + // We just need one block read using the ready-set values above. + return BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + + } else if (params.input_dims[params.bcast_dim] == 1) { + // Broadcast bcast dimension (< NumDims) by bcast_dim_size. + const int broadcast_bcast_dim = + IsColMajor ? 2 * params.inner_dim_count + 1 : 2 * NumDims - 2 * params.inner_dim_count - 2; + + params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = params.output_strides[params.bcast_dim]; + + return BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + + } else { + // Keep track of the total number of the coefficients written to the + // output block. + Index num_output_coeffs = 0; + + // The general case. Let's denote the output block as + // + // x[..., a:a+bcast_dim_size, :, ..., :] + // + // where a:a+bcast_dim_size is a slice on the bcast_dim dimension + // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3 + // sub-blocks: + // + // (1) a:b, where b is the smallest multiple of + // input_dims[bcast_dim_start] in [a, a+bcast_dim_size]. + // + // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start] + // in [a, a+bcast_dim_size]. + // + // (3) c:a+bcast_dim_size . + // + // Or, when b and c do not exist, we just need to process the whole block + // together. + + // Find a. + const Index bcast_dim_left_index = bcast_offset / m_outputStrides[params.bcast_dim]; + + // Find b and c. + const Index input_bcast_dim_size = params.input_dims[params.bcast_dim]; + + // First multiple after a. This is b when <= bcast_dim_left_index + + // bcast_dim_size. + const Index first_multiple = + numext::div_ceil(bcast_dim_left_index, input_bcast_dim_size) * input_bcast_dim_size; + + if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) { + // b exists, so does c. Find it. + const Index last_multiple = + (bcast_dim_left_index + params.bcast_dim_size) / input_bcast_dim_size * input_bcast_dim_size; + const int copy_bcast_dim = + IsColMajor ? 2 * params.inner_dim_count : 2 * NumDims - 2 * params.inner_dim_count - 1; + const int broadcast_bcast_dim = + IsColMajor ? 2 * params.inner_dim_count + 1 : 2 * NumDims - 2 * params.inner_dim_count - 2; + + if (first_multiple > bcast_dim_left_index) { + const Index head_size = first_multiple - bcast_dim_left_index; + params.input_block_sizes[params.bcast_dim] = head_size; + params.bcast_block_sizes[copy_bcast_dim] = head_size; + params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = 1; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim]; + + num_output_coeffs += + BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + } + if (first_multiple < last_multiple) { + params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size; + params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size; + params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = (last_multiple - first_multiple) / input_bcast_dim_size; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim]; + const Index offset = (first_multiple - bcast_dim_left_index) * m_outputStrides[params.bcast_dim]; + + num_output_coeffs += + BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, offset, scratch, + materialized_output, materialized_input, materialized_input_size); + } + if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) { + const Index tail_size = bcast_dim_left_index + params.bcast_dim_size - last_multiple; + params.input_block_sizes[params.bcast_dim] = tail_size; + params.bcast_block_sizes[copy_bcast_dim] = tail_size; + params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = 1; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim]; + const Index offset = (last_multiple - bcast_dim_left_index) * m_outputStrides[params.bcast_dim]; + + num_output_coeffs += + BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, offset, scratch, + materialized_output, materialized_input, materialized_input_size); + } + } else { + // b and c do not exist. + const int copy_bcast_dim = + IsColMajor ? 2 * params.inner_dim_count : 2 * NumDims - 2 * params.inner_dim_count - 1; + params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size; + params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size; + params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim]; + + num_output_coeffs += + BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes, + params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + } + + return num_output_coeffs; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock( + const Dimensions& input_block_sizes, const Dimensions& input_block_strides, + const BroadcastDimensions& bcast_block_sizes, const BroadcastDimensions& bcast_block_strides, + const BroadcastDimensions& bcast_input_strides, Index bcast_offset, Index offset, TensorBlockScratch& scratch, + ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, size_t* materialized_input_size) const { + // ---------------------------------------------------------------------- // + // Tensor block descriptor for reading block from the input. + const Index input_offset = bcast_offset + offset; + TensorBlockDesc input_desc(IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset), + input_block_sizes); + + ArgTensorBlock input_block = m_impl.block(input_desc, scratch); + + // ---------------------------------------------------------------------- // + // Materialize input block into a temporary memory buffer only if it's not + // already available in the arg block. + const ScalarNoConst* input_buffer = NULL; + + if (input_block.data() != NULL) { + // Input block already has raw data, there is no need to materialize it. + input_buffer = input_block.data(); + + } else { + // Otherwise we have to do block assignment into a temporary buffer. + + // Maybe reuse previously allocated buffer, or allocate a new one with a + // scratch allocator. + const size_t input_total_size = input_block_sizes.TotalSize(); + if (*materialized_input == NULL || *materialized_input_size < input_total_size) { + *materialized_input_size = input_total_size; + void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar)); + *materialized_input = static_cast(mem); + } + + typedef internal::TensorBlockAssignment + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target(input_block_sizes, input_block_strides, *materialized_input), + input_block.expr()); + + input_buffer = *materialized_input; + } + + // ---------------------------------------------------------------------- // + // Copy data from materialized input block to the materialized output, using + // given broadcast strides (strides with zeroes). + typedef internal::TensorBlockIO TensorBlockIO; + + typename TensorBlockIO::Src src(bcast_input_strides, input_buffer); + typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides, materialized_output + offset); + + return TensorBlockIO::Copy(dst, src); + } + + protected: + const Device EIGEN_DEVICE_REF m_device; + const std::remove_reference_t m_broadcast; + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h new file mode 100644 index 00000000000..5db563d601b --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -0,0 +1,474 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorKChippingReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor. + * + * + */ + +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions - 1; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorChippingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorChippingOp type; +}; + +template +struct DimensionId { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { + EIGEN_UNUSED_VARIABLE(dim); + eigen_assert(dim == DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { return DimId; } +}; +template <> +struct DimensionId { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) { eigen_assert(dim >= 0); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { return actual_dim; } + + private: + const DenseIndex actual_dim; +}; + +} // end namespace internal + +template +class TensorChippingOp : public TensorBase > { + public: + typedef TensorBase > Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) + : m_xpr(expr), m_offset(offset), m_dim(dim) { + eigen_assert(dim < XprType::NumDimensions && dim >= 0 && "Chip_Dim_out_of_range"); + } + + EIGEN_DEVICE_FUNC const Index offset() const { return m_offset; } + EIGEN_DEVICE_FUNC const Index dim() const { return m_dim.actualDim(); } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorChippingOp) + + protected: + typename XprType::Nested m_xpr; + const Index m_offset; + const internal::DimensionId m_dim; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorChippingOp XprType; + static constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + static constexpr int NumDims = NumInputDims - 1; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + static constexpr int Layout = TensorEvaluator::Layout; + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets. + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + // Chipping of outer-most dimension is a trivial operation, because we can + // read and write directly from the underlying tensor using single offset. + IsOuterChipping = (Layout == ColMajor && DimId == NumInputDims - 1) || (Layout == RowMajor && DimId == 0), + // Chipping inner-most dimension. + IsInnerChipping = (Layout == ColMajor && DimId == 0) || (Layout == RowMajor && DimId == NumInputDims - 1), + // Prefer block access if the underlying expression prefers it, otherwise + // only if chipping is not trivial. + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || !IsOuterChipping, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef internal::TensorBlockDescriptor ArgTensorBlockDesc; + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { + EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(NumInputDims > m_dim.actualDim()); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + eigen_assert(op.offset() < input_dims[m_dim.actualDim()]); + + int j = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (i != m_dim.actualDim()) { + m_dimensions[j] = input_dims[i]; + ++j; + } + } + + m_stride = 1; + m_inputStride = 1; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < m_dim.actualDim(); ++i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } else { + for (int i = NumInputDims - 1; i > m_dim.actualDim(); --i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } + m_inputStride *= input_dims[m_dim.actualDim()]; + m_inputOffset = m_stride * op.offset(); + + // Check if chipping is effectively inner or outer: products of dimensions + // before or after the chipped dimension is `1`. + Index after_chipped_dim_product = 1; + for (int i = static_cast(m_dim.actualDim()) + 1; i < NumInputDims; ++i) { + after_chipped_dim_product *= input_dims[i]; + } + + Index before_chipped_dim_product = 1; + for (int i = 0; i < m_dim.actualDim(); ++i) { + before_chipped_dim_product *= input_dims[i]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_isEffectivelyInnerChipping = before_chipped_dim_product == 1; + m_isEffectivelyOuterChipping = after_chipped_dim_product == 1; + } else { + m_isEffectivelyInnerChipping = after_chipped_dim_product == 1; + m_isEffectivelyOuterChipping = before_chipped_dim_product == 1; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType /*data*/, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_impl.coeff(srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + if (isInnerChipping()) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + Index inputIndex = index * m_inputStride + m_inputOffset; + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = m_impl.coeff(inputIndex); + inputIndex += m_inputStride; + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } else if (isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer division. + eigen_assert(m_stride > index); + return m_impl.template packet(index + m_inputOffset); + } else { + const Index idx = index / m_stride; + const Index rem = index - idx * m_stride; + if (rem + PacketSize <= m_stride) { + Index inputIndex = idx * m_inputStride + m_inputOffset + rem; + return m_impl.template packet(inputIndex); + } else { + // Cross the stride boundary. Fallback to slow path. + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index); + ++index; + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double cost = 0; + if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims - 1)) { + cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + cost += TensorOpCost::AddCost(); + } else { + cost += 3 * TensorOpCost::MulCost() + TensorOpCost::DivCost() + 3 * TensorOpCost::AddCost(); + } + + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + internal::TensorBlockResourceRequirements::skewed(target_size), m_impl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool root_of_expr_ast = false) const { + const Index chip_dim = m_dim.actualDim(); + + DSizes input_block_dims; + for (int i = 0; i < NumInputDims; ++i) { + input_block_dims[i] = i < chip_dim ? desc.dimension(i) : i > chip_dim ? desc.dimension(i - 1) : 1; + } + + ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims); + + // Try to reuse destination buffer for materializing argument block. + if (desc.HasDestinationBuffer()) { + DSizes arg_destination_strides; + for (int i = 0; i < NumInputDims; ++i) { + arg_destination_strides[i] = i < chip_dim ? desc.destination().strides()[i] + : i > chip_dim ? desc.destination().strides()[i - 1] + : 0; // for dimensions of size `1` stride should never be used. + } + + arg_desc.template AddDestinationBuffer(desc.destination().template data(), + arg_destination_strides); + } + + ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); + + if (arg_block.data() != NULL) { + // Forward argument block buffer if possible. + return TensorBlock(arg_block.kind(), arg_block.data(), desc.dimensions()); + + } else { + // Assign argument block expression to a buffer. + + // Prepare storage for the materialized chipping result. + const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + + typedef internal::TensorBlockAssignment + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target(arg_desc.dimensions(), internal::strides(arg_desc.dimensions()), + block_storage.data()), + arg_block.expr()); + + return block_storage.AsTensorMaterializedBlock(); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); + if (isOuterChipping() && result) { + return result + m_inputOffset; + } else { + return NULL; + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex; + if (isInnerChipping()) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + inputIndex = index * m_inputStride + m_inputOffset; + } else if (isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer + // division. + eigen_assert(m_stride > index); + inputIndex = index + m_inputOffset; + } else { + const Index idx = index / m_stride; + inputIndex = idx * m_inputStride + m_inputOffset; + index -= idx * m_stride; + inputIndex += index; + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isInnerChipping() const { + return IsInnerChipping || m_isEffectivelyInnerChipping; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isOuterChipping() const { + return IsOuterChipping || m_isEffectivelyOuterChipping; + } + + Dimensions m_dimensions; + Index m_stride; + Index m_inputOffset; + Index m_inputStride; + TensorEvaluator m_impl; + const internal::DimensionId m_dim; + const Device EIGEN_DEVICE_REF m_device; + + // If product of all dimensions after or before the chipped dimension is `1`, + // it is effectively the same as chipping innermost or outermost dimension. + bool m_isEffectivelyInnerChipping; + bool m_isEffectivelyOuterChipping; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorChippingOp XprType; + static constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + static constexpr int NumDims = NumInputDims - 1; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + if (this->isInnerChipping()) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(this->m_stride == 1); + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + internal::pstore(values, x); + Index inputIndex = index * this->m_inputStride + this->m_inputOffset; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->m_impl.coeffRef(inputIndex) = values[i]; + inputIndex += this->m_inputStride; + } + } else if (this->isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer division. + eigen_assert(this->m_stride > index); + this->m_impl.template writePacket(index + this->m_inputOffset, x); + } else { + const Index idx = index / this->m_stride; + const Index rem = index - idx * this->m_stride; + if (rem + PacketSize <= this->m_stride) { + const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; + this->m_impl.template writePacket(inputIndex, x); + } else { + // Cross stride boundary. Fallback to slow path. + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + internal::pstore(values, x); + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index) = values[i]; + ++index; + } + } + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) { + eigen_assert(this->m_impl.data() != NULL); + + const Index chip_dim = this->m_dim.actualDim(); + + DSizes input_block_dims; + for (int i = 0; i < NumInputDims; ++i) { + input_block_dims[i] = i < chip_dim ? desc.dimension(i) : i > chip_dim ? desc.dimension(i - 1) : 1; + } + + typedef TensorReshapingOp, const typename TensorBlock::XprType> TensorBlockExpr; + + typedef internal::TensorBlockAssignment TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target(input_block_dims, internal::strides(this->m_impl.dimensions()), + this->m_impl.data(), this->srcCoeff(desc.offset())), + block.expr().reshape(input_block_dims)); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h new file mode 100644 index 00000000000..e9c130dceb2 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -0,0 +1,354 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorConcatenationOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor concatenation class. + * + * + */ +namespace internal { +template +struct traits > { + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename promote_storage_type::ret Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef + typename promote_index_type::Index, typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = traits::Layout; + enum { Flags = 0 }; + typedef std::conditional_t::val, + typename traits::PointerType, typename traits::PointerType> + PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorConcatenationOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorConcatenationOp type; +}; + +} // end namespace internal + +template +class TensorConcatenationOp : public TensorBase, WriteAccessors> { + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::nested::type Nested; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& lhsExpression() const { + return m_lhs_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& rhsExpression() const { + return m_rhs_xpr; + } + + EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorConcatenationOp) + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Axis m_axis; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorConcatenationOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + static constexpr int RightNumDims = + internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = + TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout) || + NumDims == 1), + YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(0 <= m_axis && m_axis < NumDims); + const Dimensions& lhs_dims = m_leftImpl.dimensions(); + const Dimensions& rhs_dims = m_rightImpl.dimensions(); + { + int i = 0; + for (; i < m_axis; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + eigen_assert(lhs_dims[i] > 0); // Now i == m_axis. + eigen_assert(rhs_dims[i] > 0); + m_dimensions[i] = lhs_dims[i] + rhs_dims[i]; + for (++i; i < NumDims; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_leftStrides[0] = 1; + m_rightStrides[0] = 1; + m_outputStrides[0] = 1; + + for (int j = 1; j < NumDims; ++j) { + m_leftStrides[j] = m_leftStrides[j - 1] * lhs_dims[j - 1]; + m_rightStrides[j] = m_rightStrides[j - 1] * rhs_dims[j - 1]; + m_outputStrides[j] = m_outputStrides[j - 1] * m_dimensions[j - 1]; + } + } else { + m_leftStrides[NumDims - 1] = 1; + m_rightStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + + for (int j = NumDims - 2; j >= 0; --j) { + m_leftStrides[j] = m_leftStrides[j + 1] * lhs_dims[j + 1]; + m_rightStrides[j] = m_rightStrides[j + 1] * rhs_dims[j + 1]; + m_outputStrides[j] = m_outputStrides[j + 1] * m_dimensions[j + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear? + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow. + // See CL/76180724 comments for more ideas. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + // Collect dimension-wise indices (subs). + array subs; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[NumDims - 1] = index; + } + + const Dimensions& left_dims = m_leftImpl.dimensions(); + if (subs[m_axis] < left_dims[m_axis]) { + Index left_index; + if (static_cast(Layout) == static_cast(ColMajor)) { + left_index = subs[0]; + EIGEN_UNROLL_LOOP + for (int i = 1; i < NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } else { + left_index = subs[NumDims - 1]; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i >= 0; --i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } + return m_leftImpl.coeff(left_index); + } else { + subs[m_axis] -= left_dims[m_axis]; + const Dimensions& right_dims = m_rightImpl.dimensions(); + Index right_index; + if (static_cast(Layout) == static_cast(ColMajor)) { + right_index = subs[0]; + EIGEN_UNROLL_LOOP + for (int i = 1; i < NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } else { + right_index = subs[NumDims - 1]; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i >= 0; --i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } + return m_rightImpl.coeff(right_index); + } + } + + // TODO(phli): Add a real vectorization. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + const int packetSize = PacketType::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost() + TensorOpCost::ModCost()); + const double lhs_size = m_leftImpl.dimensions().TotalSize(); + const double rhs_size = m_rightImpl.dimensions().TotalSize(); + return (lhs_size / (lhs_size + rhs_size)) * m_leftImpl.costPerCoeff(vectorized) + + (rhs_size / (lhs_size + rhs_size)) * m_rightImpl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_leftStrides; + array m_rightStrides; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; + const Axis m_axis; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorConcatenationOp XprType; + typedef typename Base::Dimensions Dimensions; + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = + TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) : Base(op, device) { + EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + // Collect dimension-wise indices (subs). + array subs; + for (int i = Base::NumDims - 1; i > 0; --i) { + subs[i] = index / this->m_outputStrides[i]; + index -= subs[i] * this->m_outputStrides[i]; + } + subs[0] = index; + + const Dimensions& left_dims = this->m_leftImpl.dimensions(); + if (subs[this->m_axis] < left_dims[this->m_axis]) { + Index left_index = subs[0]; + for (int i = 1; i < Base::NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i]; + } + return this->m_leftImpl.coeffRef(left_index); + } else { + subs[this->m_axis] -= left_dims[this->m_axis]; + const Dimensions& right_dims = this->m_rightImpl.dimensions(); + Index right_index = subs[0]; + for (int i = 1; i < Base::NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i]; + } + return this->m_rightImpl.coeffRef(right_index); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + const int packetSize = PacketType::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize()); + + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + internal::pstore(values, x); + for (int i = 0; i < packetSize; ++i) { + coeffRef(index + i) = values[i]; + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h new file mode 100644 index 00000000000..6f1c9744ee0 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -0,0 +1,962 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorContraction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor contraction class. + * + * + */ +namespace internal { + +template +struct traits> { + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename gebp_traits, + std::remove_const_t>::ResScalar Scalar; + + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef + typename promote_index_type::Index, typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; + + // From NumDims below. + static constexpr int NumDimensions = + traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; + static constexpr int Layout = traits::Layout; + typedef std::conditional_t::val, + typename traits::PointerType, typename traits::PointerType> + PointerType; + + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorContractionOp& type; +}; + +template +struct nested, 1, + typename eval>::type> { + typedef TensorContractionOp type; +}; + +template +struct traits< + TensorEvaluator, Device_>> { + typedef Indices_ Indices; + typedef LeftArgType_ LeftArgType; + typedef RightArgType_ RightArgType; + typedef OutputKernelType_ OutputKernelType; + typedef Device_ Device; + + // From NumDims below. + static constexpr int NumDimensions = + traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; +}; + +// Helper class to allocate and deallocate temporary memory for packed buffers. +template +struct TensorContractionBlockMemAllocator { + typedef void* BlockMemHandle; + + template + EIGEN_DEVICE_FUNC static BlockMemHandle allocate(Device& d, const Index bm, const Index bk, const Index bn, + LhsScalar** lhs_block, RhsScalar** rhs_block) { + eigen_assert(lhs_block); + eigen_assert(rhs_block); + BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); + char* block_mem = static_cast(d.allocate(sz.lhs_size + sz.rhs_size)); + *lhs_block = static_cast(static_cast(block_mem)); + *rhs_block = static_cast(static_cast(block_mem + sz.lhs_size)); + return block_mem; + } + + template + EIGEN_DEVICE_FUNC static BlockMemHandle allocateSlices(Device& d, const Index bm, const Index bk, const Index bn, + const Index num_lhs, const Index num_rhs, + const Index num_slices, std::vector* lhs_blocks, + std::vector* rhs_blocks) { + eigen_assert(num_slices > 0); + eigen_assert(num_lhs >= 0 && num_rhs >= 0); + eigen_assert(num_lhs == 0 || lhs_blocks); + eigen_assert(num_rhs == 0 || rhs_blocks); + BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); + void* block_mem = d.allocate((num_lhs * sz.lhs_size + num_rhs * sz.rhs_size) * num_slices); + eigen_assert(block_mem); + char* mem = static_cast(block_mem); + + for (Index x = 0; x < num_slices; x++) { + if (num_lhs > 0) lhs_blocks[x].resize(num_lhs); + for (Index m = 0; m < num_lhs; m++) { + lhs_blocks[x][m] = static_cast(static_cast(mem)); + mem += sz.lhs_size; + } + if (num_rhs > 0) rhs_blocks[x].resize(num_rhs); + for (Index n = 0; n < num_rhs; n++) { + rhs_blocks[x][n] = static_cast(static_cast(mem)); + mem += sz.rhs_size; + } + } + + return block_mem; + } + + template + EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { + d.deallocate(handle); + } + + private: + struct BlockSizes { + Index lhs_size; + Index rhs_size; + }; + EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm, const Index bk, const Index bn) { + Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + BlockSizes sz; + sz.lhs_size = numext::div_ceil(bm * bk * sizeof(LhsScalar), align) * align; + sz.rhs_size = numext::div_ceil(bn * bk * sizeof(RhsScalar), align) * align; + return sz; + } +}; + +// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in +// ColMajor storage order. This property is guaranteed by the +// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack +// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix +// multiplication for these blocks. Default tensor contraction uses +// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see +// GeneralBlocPanelKernel.h for details). +// +// By specializing contraction kernels we can use other low level libraries to +// perform matrix multiplication, and still rely on Eigen contraction evaluator. +// This also includes full support in TensorContractionThreadPool, assuming that +// underlying gemm do not use it's own threading. +// +// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of +// multiplication, lhs tensor and rhs tensor respectively. +// +// - StorageIndex - index type for the tensor expressions. In practice almost +// always is Eigen::Index. +// +// - OutputMapper provides access to the memory of the output matrix. In +// practice it's always column major blas_data_mapper (it must be of ResScalar +// type). +// +// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional +// view into the Lhs/Rhs tensor expressions. In practice it's +// TensorContractionInputMapper, or some specialization of it based on the +// type of tensor expression (e.g. TensorImagePatchOp has optimized input +// mapper). +template +struct TensorContractionKernel { + // True if `invoke()` supports `beta` in `C <- alpha * A * B + beta * C` + // (otherwise beta should be always equal to 1). + enum { HasBeta = false }; + + EIGEN_DEVICE_FUNC TensorContractionKernel(StorageIndex m_, StorageIndex k_, StorageIndex n_, StorageIndex bm_, + StorageIndex bk_, StorageIndex bn_) + : m(m_), k(k_), n(n_), bm(bm_), bk(bk_), bn(bn_) {} + + // Pack blocks of Lhs and Rhs into contiguous blocks in memory. + typedef LhsScalar* LhsBlock; + typedef RhsScalar* RhsBlock; + + // Packed Lhs/Rhs block memory allocator. + typedef TensorContractionBlockMemAllocator BlockMemAllocator; + typedef typename BlockMemAllocator::BlockMemHandle BlockMemHandle; + + typedef typename internal::gebp_traits Traits; + + typedef internal::gemm_pack_lhs + LhsPacker; + + typedef internal::gemm_pack_rhs + RhsPacker; + + typedef internal::gebp_kernel + GebpKernel; + + template + EIGEN_DEVICE_FUNC BlockMemHandle allocate(Device& d, LhsBlock* lhs_block, RhsBlock* rhs_block) { + return BlockMemAllocator::allocate(d, bm, bk, bn, lhs_block, rhs_block); + } + + template + EIGEN_DEVICE_FUNC BlockMemHandle allocateSlices(Device& d, const StorageIndex num_lhs, const StorageIndex num_rhs, + const StorageIndex num_slices, std::vector* lhs_blocks, + std::vector* rhs_blocks) { + return BlockMemAllocator::allocateSlices(d, bm, bk, bn, num_lhs, num_rhs, num_slices, lhs_blocks, rhs_blocks); + } + + template + EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { + BlockMemAllocator::deallocate(d, handle); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packLhs(LhsBlock* lhsBlock, const typename LhsMapper::SubMapper& data_mapper, + const StorageIndex depth, const StorageIndex rows) { + LhsPacker()(*lhsBlock, data_mapper, depth, rows, /*stride*/ 0, + /*offset*/ 0); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packRhs(RhsBlock* rhsBlock, const typename RhsMapper::SubMapper& data_mapper, + const StorageIndex depth, const StorageIndex cols) { + RhsPacker()(*rhsBlock, data_mapper, depth, cols); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void invoke(const OutputMapper& output_mapper, const LhsBlock& lhsBlock, + const RhsBlock& rhsBlock, const StorageIndex rows, + const StorageIndex depth, const StorageIndex cols, + const ResScalar alpha, const ResScalar beta) { + // Default GEBP kernel does not support beta. + eigen_assert(beta == ResScalar(1)); + static const int kComputeStrideFromBlockDimensions = -1; + GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha, + /*strideA*/ kComputeStrideFromBlockDimensions, + /*strideB*/ kComputeStrideFromBlockDimensions, + /*offsetA*/ 0, /*offsetB*/ 0); + } + + private: + // These are dimensions of the original Tensors, and selected block sizes. The + // actual block sizes passed to all function above might be smaller because of + // the partial blocks at the end. + const StorageIndex m; + const StorageIndex k; + const StorageIndex n; + const StorageIndex bm; + const StorageIndex bk; + const StorageIndex bn; +}; + +} // end namespace internal + +// Tensor contraction params that should enable to get from output matrix +// 2-dimensional coordinates to the output tensor dimensions. +struct TensorContractionParams { + // TensorContraction evaluator assumes that both tensors are in ColMajor + // layout, if tensors are in RowMajor evaluator swap lhs with rhs. + bool swapped_arguments; +}; + +// Output kernel allows to fuse operations into the tensor contraction. +// +// Examples: +// 1. Elementwise Relu transformation following Conv2D. +// 2. AddBias to the Conv2D output channels dimension. +// +// The NoOpOutputKernel implements an output kernel that does absolutely nothing. +struct NoOpOutputKernel { + /** + * Tensor contraction evaluator calls this kernel after finishing each block + * of output matrix. Output blocks belong to the 2-dimensional output tensor. + * + * TensorContractionParams contains contraction dimensions information + * required to map output 2-d space into the expected output tensor space + * (potentially higher dimensional). + * + * \param[in] output_mapper Access to output tensor memory + * \param[in] params Tensor contraction parameters + * \param[in] i Index of a first row available through output_mapper + * \param[in] j Index of a first column available through output_mapper + * \param[in] num_rows Number of available rows + * \param[in] num_cols Number of available columns + */ + template + EIGEN_ALWAYS_INLINE void operator()(const internal::blas_data_mapper& output_mapper, + const TensorContractionParams& params, Index i, Index j, Index num_rows, + Index num_cols) const { + EIGEN_UNUSED_VARIABLE(output_mapper); + EIGEN_UNUSED_VARIABLE(params); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j); + EIGEN_UNUSED_VARIABLE(num_rows); + EIGEN_UNUSED_VARIABLE(num_cols); + } +}; + +template +class TensorContractionOp + : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename internal::gebp_traits::ResScalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(const LhsXprType& lhs, const RhsXprType& rhs, + const Indices& dims, + const OutputKernelType& output_kernel = OutputKernelType()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims), m_output_kernel(output_kernel) {} + + EIGEN_DEVICE_FUNC const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC const internal::remove_all_t& lhsExpression() const { + return m_lhs_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& rhsExpression() const { + return m_rhs_xpr; + } + + EIGEN_DEVICE_FUNC const OutputKernelType& outputKernel() const { return m_output_kernel; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Indices m_indices; + const OutputKernelType m_output_kernel; +}; + +template +struct TensorContractionEvaluatorBase { + typedef typename internal::traits::Indices Indices; + typedef typename internal::traits::LeftArgType LeftArgType; + typedef typename internal::traits::RightArgType RightArgType; + typedef typename internal::traits::OutputKernelType OutputKernelType; + typedef typename internal::traits::Device Device; + + typedef TensorContractionOp XprType; + typedef std::remove_const_t Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = true, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = true + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef std::conditional_t(Layout) == static_cast(ColMajor), LeftArgType, RightArgType> + EvalLeftArgType; + typedef std::conditional_t(Layout) == static_cast(ColMajor), RightArgType, LeftArgType> + EvalRightArgType; + + typedef TensorEvaluator LeftEvaluatorType; + typedef TensorEvaluator RightEvaluatorType; + + static constexpr int LDims = + internal::array_size::Dimensions>::value; + static constexpr int RDims = + internal::array_size::Dimensions>::value; + static constexpr int ContractDims = internal::array_size::value; + static constexpr int NumDims = LDims + RDims - 2 * ContractDims; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + typedef DSizes Dimensions; + + EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.lhsExpression(), + op.rhsExpression()), + device), + m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.rhsExpression(), + op.lhsExpression()), + device), + m_device(device), + m_output_kernel(op.outputKernel()), + m_result(NULL) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + DSizes eval_left_dims; + DSizes eval_right_dims; + array, ContractDims> eval_op_indices; + if (static_cast(Layout) == static_cast(ColMajor)) { + // For ColMajor, we keep using the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[i]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[i]; + } + // We keep the pairs of contracting indices. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = op.indices()[i].first; + eval_op_indices[i].second = op.indices()[i].second; + } + } else { + // For RowMajor, we need to reverse the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; + } + // We need to flip all the pairs of contracting indices as well as + // reversing the dimensions. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first; + } + } + + // Check for duplicate axes and make sure the first index in eval_op_indices + // is increasing. Using O(n^2) sorting is OK since ContractDims is small + for (int i = 0; i < ContractDims; i++) { + for (int j = i + 1; j < ContractDims; j++) { + eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first && + eval_op_indices[j].second != eval_op_indices[i].second && "contraction axes should be unique"); + if (eval_op_indices[j].first < eval_op_indices[i].first) { + numext::swap(eval_op_indices[j], eval_op_indices[i]); + } + } + } + + array lhs_strides; + lhs_strides[0] = 1; + for (int i = 0; i < LDims - 1; ++i) { + lhs_strides[i + 1] = lhs_strides[i] * eval_left_dims[i]; + } + + array rhs_strides; + rhs_strides[0] = 1; + for (int i = 0; i < RDims - 1; ++i) { + rhs_strides[i + 1] = rhs_strides[i] * eval_right_dims[i]; + } + + if (m_i_strides.size() > 0) m_i_strides[0] = 1; + if (m_j_strides.size() > 0) m_j_strides[0] = 1; + if (m_k_strides.size() > 0) m_k_strides[0] = 1; + + m_i_size = 1; + m_j_size = 1; + m_k_size = 1; + + // To compute the dimension, we simply concatenate the non-contracting + // dimensions of the left and then the right tensor. Additionally, we also + // compute the strides corresponding to the left non-contracting + // dimensions and right non-contracting dimensions. + m_lhs_inner_dim_contiguous = true; + int dim_idx = 0; + Index nocontract_idx = 0; + + for (int i = 0; i < LDims; i++) { + // find if we are contracting on index i of left tensor + bool contracting = false; + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].first == i) { + contracting = true; + break; + } + } + if (!contracting) { + // add dimension size to output dimensions + m_dimensions[dim_idx] = eval_left_dims[i]; + m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; + if (dim_idx != i) { + m_lhs_inner_dim_contiguous = false; + } + if (nocontract_idx + 1 < internal::array_size::value) { + m_i_strides[nocontract_idx + 1] = m_i_strides[nocontract_idx] * eval_left_dims[i]; + } else { + m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; + } + dim_idx++; + nocontract_idx++; + } + } + + nocontract_idx = 0; + for (int i = 0; i < RDims; i++) { + bool contracting = false; + // find if we are contracting on index i of right tensor + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].second == i) { + contracting = true; + break; + } + } + if (!contracting) { + m_dimensions[dim_idx] = eval_right_dims[i]; + if (nocontract_idx + 1 < internal::array_size::value) { + m_j_strides[nocontract_idx + 1] = m_j_strides[nocontract_idx] * eval_right_dims[i]; + } else { + m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; + } + m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; + dim_idx++; + nocontract_idx++; + } + } + + // Now compute the strides corresponding to the contracting dimensions. We + // assumed above that non-contracting axes are represented in the same order + // in the matrix as they are in the tensor. This is not the case for + // contracting axes. As the contracting axes must be of the same size in + // each tensor, we'll only look at the first tensor here. + m_rhs_inner_dim_contiguous = true; + m_rhs_inner_dim_reordered = false; + for (int i = 0; i < ContractDims; i++) { + Index left = eval_op_indices[i].first; + Index right = eval_op_indices[i].second; + + Index size = eval_left_dims[left]; + eigen_assert(size == eval_right_dims[right] && "Contraction axes must be same size"); + + if (i + 1 < static_cast(internal::array_size::value)) { + m_k_strides[i + 1] = m_k_strides[i] * size; + } else { + m_k_size = m_k_strides[i] * size; + } + m_left_contracting_strides[i] = lhs_strides[left]; + m_right_contracting_strides[i] = rhs_strides[right]; + + if (i > 0 && right < eval_op_indices[i - 1].second) { + m_rhs_inner_dim_reordered = true; + } + if (right != i) { + m_rhs_inner_dim_contiguous = false; + } + } + + // If the layout is RowMajor, we need to reverse the m_dimensions + if (static_cast(Layout) == static_cast(RowMajor)) { + for (int i = 0, j = NumDims - 1; i < j; i++, j--) { + numext::swap(m_dimensions[i], m_dimensions[j]); + } + } + + // A set of parameters that will allow output kernel to get from output + // tensor dimensions (i, j) into the original tensor dimensions. + // TODO(ezhulenev): Add parameters required to infer output tensor index for + // more complex contractions than 2x2 on internal dimension. + m_tensor_contraction_params.swapped_arguments = static_cast(Layout) == RowMajor; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType dest, EvalSubExprsCallback done) { + m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { + m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { + if (dest) { + evalToAsync(dest, [done]() { done(false); }); + } else { + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalToAsync(m_result, [done]() { done(true); }); + } + }); + }); + } +#endif // EIGEN_USE_THREADS + +#ifndef TENSOR_CONTRACTION_DISPATCH +#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ + if (this->m_lhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } \ + } else { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } \ + } +#endif + +#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH +#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \ + if (this->m_lhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } \ + } else { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } \ + } +#endif + + EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { + static_cast(this)->template evalProduct(buffer); + } + +#ifdef EIGEN_USE_THREADS + template + void evalToAsync(Scalar* buffer, EvalToCallback done) const { + static_cast(this)->template evalProductAsync(buffer, std::move(done)); + } +#endif // EIGEN_USE_THREADS + + template + void evalProductSequential(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv( + buffer); + } else { + this->template evalGemm( + buffer); + } + } + + template +#if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC +#endif + void + evalGemv(Scalar* buffer) const { + const Index rows = m_i_size; + const Index cols = m_k_size; + + typedef std::remove_const_t LhsScalar; + typedef std::remove_const_t RhsScalar; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + const Index lhs_packet_size = internal::unpacket_traits::size; + const Index rhs_packet_size = internal::unpacket_traits::size; + const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned; + const int rhs_alignment = RightEvaluator::IsAligned ? Aligned : Unaligned; + typedef internal::TensorContractionInputMapper + LhsMapper; + + typedef internal::TensorContractionInputMapper + RhsMapper; + + LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); + RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides); + + const Scalar alpha(1); + const Index resIncr(1); + + // zero out the result buffer (which must be of size at least rows * sizeof(Scalar) + m_device.fill(buffer, buffer + rows, Scalar(0)); + + internal::general_matrix_vector_product::run(rows, cols, lhs, rhs, buffer, resIncr, alpha); + + typedef internal::blas_data_mapper OutputMapper; + m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params, static_cast(0), + static_cast(0), rows, static_cast(1)); + } + + template +#if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC +#endif + void + evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + this->template evalGemmPartial(buffer, 0, k, 1); + } + + template + EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel(Scalar* buffer, Index k_start, Index k_end, + int num_threads) const { + evalGemmPartial(buffer, k_start, k_end, num_threads); + } + + template + EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const { + eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= this->m_k_size); + // columns in slice on left side, rows on right side + const Index k_slice = k_end - k_start; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // define data mappers for Lhs and Rhs + typedef std::remove_const_t LhsScalar; + typedef std::remove_const_t RhsScalar; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + const Index lhs_packet_size = internal::unpacket_traits::size; + const Index rhs_packet_size = internal::unpacket_traits::size; + + typedef internal::TensorContractionInputMapper + LhsMapper; + + typedef internal::TensorContractionInputMapper + RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + typedef internal::TensorContractionKernel + TensorContractionKernel; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + // Sizes of the blocks to load in cache. See the Goto paper for details. + internal::TensorContractionBlocking blocking( + k_slice, m, n, num_threads); + const Index kc = blocking.kc(); + const Index mc = numext::mini(m, blocking.mc()); + const Index nc = numext::mini(n, blocking.nc()); + + typedef typename TensorContractionKernel::LhsBlock LhsBlock; + typedef typename TensorContractionKernel::RhsBlock RhsBlock; + + LhsBlock blockA; + RhsBlock blockB; + + TensorContractionKernel kernel(m, k_slice, n, mc, kc, nc); + + typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; + const BlockMemHandle packed_mem = kernel.allocate(this->m_device, &blockA, &blockB); + + // If a contraction kernel does not support beta, explicitly initialize + // output buffer with zeroes. + if (!TensorContractionKernel::HasBeta) { + this->m_device.fill(buffer, buffer + m * n, Scalar(0)); + } + + for (Index i2 = 0; i2 < m; i2 += mc) { + const Index actual_mc = numext::mini(i2 + mc, m) - i2; + for (Index k2 = k_start; k2 < k_end; k2 += kc) { + // make sure we don't overshoot right edge of left matrix, then pack vertical panel + const Index actual_kc = numext::mini(k2 + kc, k_end) - k2; + kernel.packLhs(&blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); + + // If kernel supports beta, there is no need to initialize output + // buffer with zeroes. + const Scalar alpha = Scalar(1); + const Scalar beta = (TensorContractionKernel::HasBeta && k2 == k_start) ? Scalar(0) : Scalar(1); + + // series of horizontal blocks + for (Index j2 = 0; j2 < n; j2 += nc) { + // make sure we don't overshoot right edge of right matrix, then pack block + const Index actual_nc = numext::mini(j2 + nc, n) - j2; + kernel.packRhs(&blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc); + + // call gebp (matrix kernel) + // The parameters here are copied from Eigen's GEMM implementation + const OutputMapper output_mapper = output.getSubMapper(i2, j2); + kernel.invoke(output_mapper, blockA, blockB, actual_mc, actual_kc, actual_nc, alpha, beta); + + // We are done with this [i2, j2] output block. + if (use_output_kernel && k2 + kc >= k_end) { + m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2, actual_mc, actual_nc); + } + } + } + } + + kernel.deallocate(this->m_device, packed_mem); + } + + EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_result[index]; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; } + + protected: + Dimensions m_dimensions; + + contract_t m_k_strides; + contract_t m_left_contracting_strides; + contract_t m_right_contracting_strides; + + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + left_nocontract_t m_i_strides; + right_nocontract_t m_j_strides; + left_nocontract_t m_left_nocontract_strides; + right_nocontract_t m_right_nocontract_strides; + + Index m_i_size; + Index m_j_size; + Index m_k_size; + + TensorContractionParams m_tensor_contraction_params; + + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; + const Device EIGEN_DEVICE_REF m_device; + OutputKernelType m_output_kernel; + EvaluatorPointerType m_result; +}; + +// evaluator for default device +template +struct TensorEvaluator, Device> + : public TensorContractionEvaluatorBase< + TensorEvaluator, Device>> { + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef std::remove_const_t Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + static constexpr int Layout = TensorEvaluator::Layout; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef std::conditional_t(ColMajor), LeftArgType, RightArgType> EvalLeftArgType; + typedef std::conditional_t(ColMajor), RightArgType, LeftArgType> EvalRightArgType; + + static constexpr int LDims = + internal::array_size::Dimensions>::value; + static constexpr int RDims = + internal::array_size::Dimensions>::value; + static constexpr int ContractDims = internal::array_size::value; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static constexpr int NumDims = LDims + RDims - 2 * ContractDims; + + // Could we use NumDimensions here? + typedef DSizes Dimensions; + + TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + template + void evalProduct(Scalar* buffer) const { + TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer)); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h new file mode 100644 index 00000000000..7fbe30a9f9f --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -0,0 +1,69 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +enum { ShardByRow = 0, ShardByCol = 1 }; + +// Default Blocking Strategy +template +class TensorContractionBlocking { + public: + /* + adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h` + requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h` + which in turn, requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in + `GeneralBlockPanelKernel.h` which in turn, requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in + `GeneralBlockPanelKernel.h` (else HIPCC will error out) + + However adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h` + results in NVCC erroring out with the following error + + ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901: + dynamic initialization is not supported for function-scope static variables within a __device__/__global__ + function + */ + +#if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC +#endif + TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) + : kc_(k), mc_(m), nc_(n) { + if (ShardingType == ShardByCol) { + computeProductBlockingSizes(kc_, mc_, nc_, num_threads); + } else { + computeProductBlockingSizes(kc_, nc_, mc_, num_threads); + } + + const int rhs_packet_size = internal::packet_traits::size; + kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ? kc_ : (kc_ / rhs_packet_size) * rhs_packet_size; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; } + + private: + StorageIndex kc_; + StorageIndex mc_; + StorageIndex nc_; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h new file mode 100644 index 00000000000..dbea8aa92af --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -0,0 +1,7 @@ + +#if defined(__clang__) || defined(__GNUC__) +#warning \ + "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file" +#endif + +#include "TensorContractionGpu.h" diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h new file mode 100644 index 00000000000..780e8961eae --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -0,0 +1,1387 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014-2015 Benoit Steiner +// Copyright (C) 2015 Navdeep Jaitly +// Copyright (C) 2014 Eric Martin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H + +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +template +__device__ EIGEN_STRONG_INLINE void EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, Scalar* lhs_shmem, + Scalar* rhs_shmem, const Index m_size, + const Index n_size, const Index k_size) { + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in threadIdx.x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts + // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. + + // On the RHS we just add 8 padding elements to the end of each block. This gives no bank + // conflicts on writes and also none on reads. + + // storage indices + const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; + const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; + + const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; + const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; + const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; + const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; + const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; + const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; + const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; + + const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; + const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; + const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; + const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; + const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; + const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; + const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; + const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; + + // in the loading code, the following variables are important: + // threadIdx.x: the vertical position in an 8x8 block + // threadIdx.y: the vertical index of the 8x8 block in the grid + // threadIdx.z: the horizontal position in an 8x8 block + // k: the horizontal index of the 8x8 block in the grid + // + // The k parameter is implicit (it was the loop counter for a loop that went + // from 0 to <8, but now that loop is unrolled in the below code. + + const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = conv(0); \ + lhs_pf1 = conv(0); \ + lhs_pf2 = conv(0); \ + lhs_pf3 = conv(0); \ + lhs_pf4 = conv(0); \ + lhs_pf5 = conv(0); \ + lhs_pf6 = conv(0); \ + lhs_pf7 = conv(0); \ + \ + rhs_pf0 = conv(0); \ + rhs_pf1 = conv(0); \ + rhs_pf2 = conv(0); \ + rhs_pf3 = conv(0); \ + rhs_pf4 = conv(0); \ + rhs_pf5 = conv(0); \ + rhs_pf6 = conv(0); \ + rhs_pf7 = conv(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ + \ + if (rhs_horiz_7 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } + +#define writeRegToShmem() \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = conv(0); \ + Scalar res(i, 1) = conv(0); \ + Scalar res(i, 2) = conv(0); \ + Scalar res(i, 3) = conv(0); \ + Scalar res(i, 4) = conv(0); \ + Scalar res(i, 5) = conv(0); \ + Scalar res(i, 6) = conv(0); \ + Scalar res(i, 7) = conv(0); + + internal::scalar_cast_op conv; + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. Despite common sense, + // the code is a bit faster with this here then at bottom of loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + +#undef prefetchIntoRegisters +#undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). + +#define lcol(i) _lcol##i + Scalar lcol(0); + Scalar lcol(1); + Scalar lcol(2); + Scalar lcol(3); + Scalar lcol(4); + Scalar lcol(5); + Scalar lcol(6); + Scalar lcol(7); + +#define rrow(j) _rrow##j + Scalar rrow(0); + Scalar rrow(1); + Scalar rrow(2); + Scalar rrow(3); + Scalar rrow(4); + Scalar rrow(5); + Scalar rrow(6); + Scalar rrow(7); + + // Now x corresponds to k, y to m, and z to n + const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; + const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; + +#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] +#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] + +#define loadData(i, j) \ + lcol(0) = lhs_element(0, j); \ + rrow(0) = rhs_element(i, 0); \ + lcol(1) = lhs_element(1, j); \ + rrow(1) = rhs_element(i, 1); \ + lcol(2) = lhs_element(2, j); \ + rrow(2) = rhs_element(i, 2); \ + lcol(3) = lhs_element(3, j); \ + rrow(3) = rhs_element(i, 3); \ + lcol(4) = lhs_element(4, j); \ + rrow(4) = rhs_element(i, 4); \ + lcol(5) = lhs_element(5, j); \ + rrow(5) = rhs_element(i, 5); \ + lcol(6) = lhs_element(6, j); \ + rrow(6) = rhs_element(i, 6); \ + lcol(7) = lhs_element(7, j); \ + rrow(7) = rhs_element(i, 7); + +#define computeCol(j) \ + res(0, j) += lcol(0) * rrow(j); \ + res(1, j) += lcol(1) * rrow(j); \ + res(2, j) += lcol(2) * rrow(j); \ + res(3, j) += lcol(3) * rrow(j); \ + res(4, j) += lcol(4) * rrow(j); \ + res(5, j) += lcol(5) * rrow(j); \ + res(6, j) += lcol(6) * rrow(j); \ + res(7, j) += lcol(7) * rrow(j); + +#define computePass(i) \ + loadData(i, i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol +#undef rrow +#undef lhs_element +#undef rhs_element +#undef loadData +#undef computeCol +#undef computePass + } // end loop over k + + // we've now iterated over all of the large (ie width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation. +#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) +#else +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask) +#endif + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); + + // actually perform the reduction, now each thread of index (_, y, z) + // contains the correct values in its registers that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. There's 2 ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. + // (2) is slightly faster than (1) due to less branching and more ILP + + // TODO: won't yield much gain, but could just use currently unused shared mem + // and then we won't have to sync + // wait for shared mem to be out of use + __syncthreads(); + +#define writeResultShmem(i, j) lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); + +#define writeRow(i) \ + writeResultShmem(i, 0); \ + writeResultShmem(i, 1); \ + writeResultShmem(i, 2); \ + writeResultShmem(i, 3); \ + writeResultShmem(i, 4); \ + writeResultShmem(i, 5); \ + writeResultShmem(i, 6); \ + writeResultShmem(i, 7); + + if (threadIdx.x == 0) { + writeRow(0); + writeRow(1); + writeRow(2); + writeRow(3); + writeRow(4); + writeRow(5); + writeRow(6); + writeRow(7); + } +#undef writeResultShmem +#undef writeRow + + const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); + const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); + + if (threadIdx.x < max_i_write) { + if (max_j_write == 8) { + // TODO: can i trade bank conflicts for coalesced writes? + Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; + Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; + Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; + Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; + Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; + Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; + Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; + Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; + + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; + } + } + } +#undef res +} + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(512, 1) +#else +__launch_bounds__(512) +#endif + EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, const OutputMapper output, const Index m_size, + const Index n_size, const Index k_size) { + __shared__ Scalar lhs_shmem[72 * 64]; + __shared__ Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + +template +__device__ __forceinline__ void EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i = 0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg = lhs.template loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg = lhs.template loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x = lhs(row + 0, col); \ + reg.y = lhs(row + 1, col); \ + reg.z = lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x = lhs(row + 0, col); \ + reg.y = lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x = lhs(row + 0, col); \ + } \ + } \ + } + + Index lhs_vert = base_m + threadIdx.x * 4; + + for (Index k = 0; k < k_size; k += 16) { + lhs_pf0 = internal::pset1(0); + rhs_pf0 = internal::pset1(0); + + Index lhs_horiz = threadIdx.y + k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k + (threadIdx.x % 4) * 4; + Index rhs_horiz0 = (threadIdx.x >> 2) + threadIdx.y * 4 + base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2; + // the following can be a bitwise operation..... some day. + if ((threadIdx.x % 8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } +#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); +#else + x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4); + x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4); +#endif + if ((threadIdx.x % 8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. + // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... + rhs_shmem2[(threadIdx.x >> 3) + threadIdx.y * 2][threadIdx.x % 8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(threadIdx.x >> 3) + threadIdx.y * 2 + 32][threadIdx.x % 8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... + + lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y + 16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); + +#define add_vals(fl1, fl2, fr1, fr2) \ + results[0].x += fl1.x * fr1.x; \ + results[0].y += fl1.y * fr1.x; \ + results[0].z += fl2.x * fr1.x; \ + results[0].w += fl2.y * fr1.x; \ + \ + results[1].x += fl1.x * fr1.y; \ + results[1].y += fl1.y * fr1.y; \ + results[1].z += fl2.x * fr1.y; \ + results[1].w += fl2.y * fr1.y; \ + \ + results[2].x += fl1.x * fr2.x; \ + results[2].y += fl1.y * fr2.x; \ + results[2].z += fl2.x * fr2.x; \ + results[2].w += fl2.y * fr2.x; \ + \ + results[3].x += fl1.x * fr2.y; \ + results[3].y += fl1.y * fr2.y; \ + results[3].z += fl2.x * fr2.y; \ + results[3].w += fl2.y * fr2.y; + + __syncthreads(); + +// Do the multiplies. +#pragma unroll + for (int koff = 0; koff < 16; koff++) { + // 32 x threads. + float2 fl1 = lhs_shmem2[koff][threadIdx.x]; + float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; + + int start_feature = threadIdx.y * 4; + float2 fr1 = rhs_shmem2[(start_feature >> 1) + 32 * ((koff % 4) / 2)][koff / 4 + (koff % 2) * 4]; + float2 fr2 = rhs_shmem2[(start_feature >> 1) + 1 + 32 * ((koff % 4) / 2)][koff / 4 + (koff % 2) * 4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = threadIdx.y * 4 + base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + }*/ + for (int i = 0; i < 4; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 4; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + +template +__device__ __forceinline__ void EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i = 0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + Index lhs_vert = base_m + threadIdx.x * 4 + (threadIdx.y % 4) * 32; + for (Index k = 0; k < k_size; k += 32) { + lhs_pf0 = internal::pset1(0); + lhs_pf1 = internal::pset1(0); + lhs_pf2 = internal::pset1(0); + lhs_pf3 = internal::pset1(0); + + rhs_pf0 = internal::pset1(0); + rhs_pf1 = internal::pset1(0); + + if (!CHECK_LHS_BOUNDARY) { + if ((threadIdx.y / 4 + k + 24) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + lhs_pf2 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 16)); + lhs_pf3 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 24)); + } else if ((threadIdx.y / 4 + k + 16) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + lhs_pf2 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 16)); + } else if ((threadIdx.y / 4 + k + 8) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + } else if ((threadIdx.y / 4 + k) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((threadIdx.y / 4 + k + 24) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + lhs_pf2 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 16)); + lhs_pf3 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 24)); + } else if ((threadIdx.y / 4 + k + 16) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + lhs_pf2 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 16)); + } else if ((threadIdx.y / 4 + k + 8) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + lhs_pf1 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k + 8)); + } else if ((threadIdx.y / 4 + k) < k_size) { + lhs_pf0 = lhs.template loadPacket(lhs_vert, (threadIdx.y / 4 + k)); + } + } else if (lhs_vert + 2 < m_size) { + if ((threadIdx.y / 4 + k + 24) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf0.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8)); + lhs_pf1.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + lhs_pf2.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 16)); + lhs_pf2.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 16)); + lhs_pf3.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 24)); + lhs_pf3.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 24)); + lhs_pf3.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 24)); + } else if ((threadIdx.y / 4 + k + 16) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf0.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8)); + lhs_pf1.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + lhs_pf2.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 16)); + lhs_pf2.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 16)); + } else if ((threadIdx.y / 4 + k + 8) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf0.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8)); + lhs_pf1.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 8)); + } else if ((threadIdx.y / 4 + k) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf0.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((threadIdx.y / 4 + k + 24) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + lhs_pf2.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 16)); + lhs_pf3.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 24)); + lhs_pf3.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 24)); + } else if ((threadIdx.y / 4 + k + 16) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + lhs_pf2.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 16)); + } else if ((threadIdx.y / 4 + k + 8) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8)); + } else if ((threadIdx.y / 4 + k) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k)); + } + } else if (lhs_vert < m_size) { + if ((threadIdx.y / 4 + k + 24) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + lhs_pf3.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 24)); + } else if ((threadIdx.y / 4 + k + 16) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16)); + } else if ((threadIdx.y / 4 + k + 8) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8)); + } else if ((threadIdx.y / 4 + k) < k_size) { + lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k)); + } + } + } + __syncthreads(); + Index rhs_vert = k + threadIdx.x * 4; + Index rhs_horiz0 = threadIdx.y * 2 + base_n; + Index rhs_horiz1 = threadIdx.y * 2 + 1 + base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k + threadIdx.x * 4 + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k + threadIdx.x * 4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[threadIdx.y + 32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. + rhs_shmem2[threadIdx.y + 64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[threadIdx.y + 96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4) \ + results[0].x += a_feat1.x * f1.x; \ + results[1].x += a_feat1.x * f1.y; \ + results[2].x += a_feat1.x * f2.x; \ + results[3].x += a_feat1.x * f2.y; \ + results[4].x += a_feat1.x * f3.x; \ + results[5].x += a_feat1.x * f3.y; \ + results[6].x += a_feat1.x * f4.x; \ + results[7].x += a_feat1.x * f4.y; \ + \ + results[0].y += a_feat1.y * f1.x; \ + results[1].y += a_feat1.y * f1.y; \ + results[2].y += a_feat1.y * f2.x; \ + results[3].y += a_feat1.y * f2.y; \ + results[4].y += a_feat1.y * f3.x; \ + results[5].y += a_feat1.y * f3.y; \ + results[6].y += a_feat1.y * f4.x; \ + results[7].y += a_feat1.y * f4.y; \ + \ + results[0].z += a_feat2.x * f1.x; \ + results[1].z += a_feat2.x * f1.y; \ + results[2].z += a_feat2.x * f2.x; \ + results[3].z += a_feat2.x * f2.y; \ + results[4].z += a_feat2.x * f3.x; \ + results[5].z += a_feat2.x * f3.y; \ + results[6].z += a_feat2.x * f4.x; \ + results[7].z += a_feat2.x * f4.y; \ + \ + results[0].w += a_feat2.y * f1.x; \ + results[1].w += a_feat2.y * f1.y; \ + results[2].w += a_feat2.y * f2.x; \ + results[3].w += a_feat2.y * f2.y; \ + results[4].w += a_feat2.y * f3.x; \ + results[5].w += a_feat2.y * f3.y; \ + results[6].w += a_feat2.y * f4.x; \ + results[7].w += a_feat2.y * f4.y; + + lhs_shmem2[threadIdx.y / 4][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y / 4 + 8][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[threadIdx.y / 4 + 16][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[threadIdx.y / 4 + 24][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[threadIdx.y / 4 + 32][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[threadIdx.y / 4 + 40][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[threadIdx.y / 4 + 48][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[threadIdx.y / 4 + 56][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + +// Do the multiplies. +#pragma unroll + for (int koff = 0; koff < 32; koff++) { + float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; + + // first feature is at (threadIdx.y/4) * 8 last is at start + 8. + int start_feature = (threadIdx.y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature / 2 + (koff % 4) * 32][koff / 4]; + float2 br2 = rhs_shmem2[start_feature / 2 + 1 + (koff % 4) * 32][koff / 4]; + float2 br3 = rhs_shmem2[start_feature / 2 + 2 + (koff % 4) * 32][koff / 4]; + float2 br4 = rhs_shmem2[start_feature / 2 + 3 + (koff % 4) * 32][koff / 4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + +#undef add_vals + + __syncthreads(); + Index horiz_base = (threadIdx.y / 4) * 8 + base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(256, 1) +#else +__launch_bounds__(256) +#endif + EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, const OutputMapper output, const Index m_size, + const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64 * 32]; + __shared__ float2 rhs_shmem[128 * 8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + bool check_rhs = (base_n + 63) >= n_size; + bool check_lhs128 = (base_m + 127) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM*)lhs_shmem), *((RHS_MEM*)rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM*)lhs_shmem), *((RHS_MEM*)rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM*)lhs_shmem), *((RHS_MEM*)rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM*)lhs_shmem), *((RHS_MEM*)rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(256, 1) +#else +__launch_bounds__(256) +#endif + EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16( + lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + +template +struct TensorEvaluator, GpuDevice> + : public TensorContractionEvaluatorBase, GpuDevice> > { + typedef GpuDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef std::remove_const_t Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + static constexpr int Layout = TensorEvaluator::Layout; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef std::conditional_t(ColMajor), LeftArgType, RightArgType> EvalLeftArgType; + typedef std::conditional_t(ColMajor), RightArgType, LeftArgType> EvalRightArgType; + + static constexpr int LDims = + internal::array_size::Dimensions>::value; + static constexpr int RDims = + internal::array_size::Dimensions>::value; + static constexpr int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static constexpr int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef std::remove_const_t LhsScalar; + typedef std::remove_const_t RhsScalar; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { + EIGEN_STATIC_ASSERT((internal::is_same::value), + GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS); + } + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } + } + + template + struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, + const GpuDevice& device) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_GPU_KERNEL((EigenContractionKernel), num_blocks, + block_size, 0, device, lhs, rhs, output, m, n, k); + } + }; + + template + struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, + const GpuDevice& device) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16), num_blocks, + block_size, 0, device, lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_GPU_KERNEL((EigenFloatContractionKernel), num_blocks, + block_size, 0, device, lhs, rhs, output, m, n, k); + } + } + }; + + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + EIGEN_UNUSED_VARIABLE(k) + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)) + this->m_device.fill(buffer, buffer + m * n, Scalar(0)); + + typedef internal::TensorContractionInputMapper + LhsMapper; + + typedef internal::TensorContractionInputMapper + RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + +#if defined(EIGEN_USE_HIP) + setGpuSharedMemConfig(hipSharedMemBankSizeEightByte); +#else + setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte); +#endif + + LaunchKernels::Run(lhs, rhs, output, m, n, k, + this->m_device); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and EIGEN_GPUCC +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h new file mode 100644 index 00000000000..6367db96409 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -0,0 +1,533 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +enum { Rhs = 0, Lhs = 1 }; + +/* + * Implementation of the Eigen blas_data_mapper class for tensors. + */ +/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the +/// default make pointer is used which is scalar * for CoeffLoader. +template class MakePointer_ = MakePointer> +struct CoeffLoader; + +template class MakePointer_ = MakePointer> +class BaseTensorContractionMapper; + +template class MakePointer_> +struct CoeffLoader { + enum { DirectOffsets = false }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) { + eigen_assert(false && "unsupported"); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_::Type data() const { + eigen_assert(false && "unsupported"); + return NULL; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { + return m_tensor.coeff(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Tensor::PacketReturnType packet(typename Tensor::Index index) const { + return m_tensor.template packet(index); + } + + private: + const Tensor m_tensor; +}; + +template class MakePointer_> +struct CoeffLoader { + enum { DirectOffsets = true }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { m_data += offset; } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_::Type data() const { + return m_data; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { + return loadConstant(m_data + index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Tensor::PacketReturnType packet(typename Tensor::Index index) const { + return internal::ploadt_ro(m_data + index); + } + + private: + typedef typename Tensor::Scalar Scalar; + + typename MakePointer_::Type m_data; +}; + +template class MakePointer_ = MakePointer> +class SimpleTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC SimpleTensorContractionMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, const contract_t& contract_strides, + const contract_t& k_strides) + : m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) {} + + enum { DirectOffsets = CoeffLoader::DirectOffsets }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { + m_tensor.offsetBuffer(offset); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void prefetch(Index /*i*/) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 + Index nocontract_val = left ? row : col; + Index linidx = 0; + EIGEN_UNROLL_LOOP + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx = nocontract_val / m_ij_strides[i]; + linidx += idx * m_nocontract_strides[i]; + nocontract_val -= idx * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx += nocontract_val; + } else { + linidx += nocontract_val * m_nocontract_strides[0]; + } + } + + Index contract_val = left ? col : row; + if (array_size::value > 0) { + EIGEN_UNROLL_LOOP + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } + } + + return linidx; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, + const Index distance) const { + const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 + Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; + Index linidx[2] = {0, 0}; + if (array_size::value > array_size::value) { + EIGEN_UNROLL_LOOP + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx[0] += nocontract_val[0]; + linidx[1] += nocontract_val[1]; + } else { + linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; + linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; + } + } + + Index contract_val[2] = {left ? col : row, left ? col : row + distance}; + if (array_size::value > 0) { + EIGEN_UNROLL_LOOP + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } + + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } + } + return IndexPair(linidx[0], linidx[1]); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { + // Only claim alignment when we can compute the actual stride (ie when we're + // dealing with the lhs with inner_dim_contiguous. This is because the + // matrix-vector product relies on the stride when dealing with aligned inputs. + return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { + return ((side == Lhs) && inner_dim_contiguous && array_size::value > 0) ? m_contract_strides[0] : 1; + } + + const CoeffLoader& tensor() const { return m_tensor; } + + const nocontract_t& nocontract_strides() const { return m_nocontract_strides; } + const nocontract_t& ij_strides() const { return m_ij_strides; } + const contract_t& contract_strides() const { return m_contract_strides; } + const contract_t& k_strides() const { return m_k_strides; } + + protected: + CoeffLoader m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + +template class MakePointer_> +class BaseTensorContractionMapper + : public SimpleTensorContractionMapper { + public: + typedef SimpleTensorContractionMapper + ParentMapper; + + EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, const contract_t& contract_strides, + const contract_t& k_strides) + : ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + std::enable_if_t::size == packet_size, PacketT> + load(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + eigen_assert(this->computeIndex(i + packet_size - 1, j) == index + packet_size - 1); + return this->m_tensor.template packet(index); + } + + const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index lastIdx = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. + if (Tensor::PacketAccess && (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) && + (lastIdx - first) == (packet_size - 1)) { + return this->m_tensor.template packet(first); + } + + EIGEN_ALIGN_MAX Scalar data[packet_size]; + + data[0] = this->m_tensor.coeff(first); + EIGEN_UNROLL_LOOP + for (Index k = 1; k < packet_size - 1; k += 2) { + const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[packet_size - 1] = this->m_tensor.coeff(lastIdx); + + return pload(data); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + std::enable_if_t::size != packet_size, PacketT> + load(Index i, Index j) const { + const Index requested_packet_size = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX Scalar data[requested_packet_size]; + + const IndexPair indexPair = this->computeIndexPair(i, j, requested_packet_size - 1); + const Index first = indexPair.first; + const Index lastIdx = indexPair.second; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < requested_packet_size - 1; k += 2) { + const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx); + + return pload(data); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { + return this->load(i, j); + } +}; + +template class MakePointer_> +class BaseTensorContractionMapper + : public SimpleTensorContractionMapper { + public: + typedef SimpleTensorContractionMapper + ParentMapper; + + EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, const contract_t& contract_strides, + const contract_t& k_strides) + : ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { + EIGEN_ALIGN_MAX Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload(data); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const { + EIGEN_ALIGN_MAX Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload(data); + } +}; + +template class MakePointer_ = MakePointer> +class TensorContractionSubMapper { + public: + typedef BaseTensorContractionMapper + ParentMapper; + typedef TensorContractionSubMapper + Self; + typedef Self LinearMapper; + typedef Self SubMapper; + + enum { + // We can use direct offsets iff the parent mapper supports then and we can compute the strides. + // TODO: we should also enable direct offsets for the Rhs case. + UseDirectOffsets = + ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size::value > 0) + }; + + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { + // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute + // this offset every time we attempt to access a coefficient. + if (UseDirectOffsets) { + Index stride = m_base_mapper.stride(); + m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride); + } + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper(i, 0); + } + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper(i, j); + } + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, j); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacketPartial(Index i, Index j, Index, Index = 0) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, j); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper.template load(i, j); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const { + if (UseDirectOffsets) { + m_base_mapper.storePacket(i, 0, p); + } + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + if (UseDirectOffsets) { + return LinearMapper(m_base_mapper, i, j); + } + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubMapper getSubMapper(Index i, Index j) const { + if (UseDirectOffsets) { + return SubMapper(m_base_mapper, i, j); + } + return SubMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const Index stride() const { return m_base_mapper.stride(); } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned; + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { + return false; + } + + const ParentMapper& base_mapper() const { return m_base_mapper; } + Index vert_offset() const { return m_vert_offset; } + Index horiz_offset() const { return m_horiz_offset; } + + private: + ParentMapper m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + +template class MakePointer_ = MakePointer> +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + public: + typedef Scalar_ Scalar; + typedef BaseTensorContractionMapper + Base; + typedef TensorContractionSubMapper + SubMapper; + typedef SubMapper VectorMapper; + typedef SubMapper LinearMapper; + + EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader& get_tensor() const { + return Base::m_tensor; + } +}; + +template +struct TensorContractionInputMapperTrait; + +template class MakePointer_> +struct TensorContractionInputMapperTrait< + TensorContractionInputMapper > { + typedef Tensor_ XprType; + static const bool inner_dim_contiguous = inner_dim_contiguous_; + static const bool inner_dim_reordered = inner_dim_reordered_; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h new file mode 100644 index 00000000000..c7203a9306e --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -0,0 +1,1654 @@ +// This file is part of Eigen, a lightweight C++ template library for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not +// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorContractionSycl.h + * + * \brief: + * TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend + * + *****************************************************************/ + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +namespace TensorSycl { +namespace internal { + +#ifndef EIGEN_SYCL_DISABLE_GEMV +/*! + * \brief TVPanelSize, a template class used for setting the panel size required for launching General TensorVector + * contraction kernel on various hardware devices. + * + * \tparam Scalar: determines the element type of the tensor/vector + * + * \tparam StorageIndex determines the Index type. + * + * \tparam NCWindow: determines the number of non-contracting element to be process by each work-group + * + * \tparam CFactor: determines the number of contracting element to be process by each thread + * + * \tparam NCFactor: determines the number of non-contracting element to be process by each thread + */ +template +struct TVPanelSize { + // LocalThreadSizeC: determines total number of thread per workgroup for the contracting dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0; + // LocalThreadSizeNC: determines total number of thread per workgroup for the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1; + // TileSizeDimNC: determines the tile size for the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor; + // TileSizeDimC: determines the tile size for the contracting dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC; + // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC; + // WorkLoadPerThreadC: determines workload per thread for loading the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC; + // BC : determines if supporting bank conflict is required + static EIGEN_CONSTEXPR bool BC = false; +}; +#endif + +/*! + * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor + contraction kernel on various hardware devices. + * + * \tparam Scalar: determines the element type of the tensor + * + * \tparam StorageIndex: determines the Index type. + * + * \tparam REG_SIZE_M: determines workload per thread for loading the M dimension This can be varied based on the + available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro). + * + * \tparam REG_SIZE_N: determines workload per thread for loading the N dimension This can be varied based on the + available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro). + * + * \tparam TSDK: determines Tile size for dimension K. The packet size is assumed to be considered + */ + +template +struct TTPanelSize { + // TileSizeDimK: determines Tile size for dimension K. The packet size is assumed to be considered + static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK; + // WorkLoadPerThreadM : determines workload per thread for loading the M dimension This can be varied based on the + // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro// +#ifndef EIGEN_SYCL_REG_M + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M; +#endif +// WorkLoadPerThreadN : determines workload per thread for loading the N dimension This can be varied based on the +// available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro +#ifndef EIGEN_SYCL_REG_N + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N; +#endif + // LocalThreadSizeM: determines total number of thread per workgroup for the m dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0; + // LocalThreadSizeN: determines total number of thread per workgroup for the n dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1; + // TileSizeDimM: determines the tile size for the m dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM; + // TileSizeDimN: determines the tile size for the n dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN; + // LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisible by packetsize + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN)); + // LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. This must be divisible by packetsize + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM)); + // BC : determines if supporting bank conflict is required + static EIGEN_CONSTEXPR bool BC = true; + // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by + // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory) + static EIGEN_CONSTEXPR bool DoubleBuffer = +#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER + false; +#else + true; +#endif +}; + +/* ! + * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to + * specialize the contraction algorithm based on device support for dedicated local memory. + */ +enum class contraction_type { local, no_local }; +/* ! + * \brief data_source an enum class determining the location of the data in a memory hierarchy (global, local, private). + */ +enum class data_source { global_mem, local_mem, private_mem }; + +/*! + * \brief read, a template function used for loading the data from global + memory. This function is used to guarantee coalesced and vectorized load whenever possible + * + * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and + vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex is the contracting dim index + * + * \param ld: is the leading dimension of the flattened tensor + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) { + const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex; + const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex; + return tensorMapper.get_tensor().template packet(row + (col * ld)); +} + +/*! + * \brief read, special overload of read function, when the read access is not vectorized + * + * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and + vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex: is the contracting dim index + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) { + const StorageIndex row = (IsRhs) ? CIndex : NCIndex; + const StorageIndex col = (IsRhs) ? NCIndex : CIndex; + return tensorMapper(row, col); +} + +/*! + * \brief write, a template function used for storing the data to local memory. This function is used to guarantee + * coalesced and vectorized store whenever possible. + * + * \tparam StorageIndex: determines the Index type + * + * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + * + * \param CIndex is the contracting dim index + */ + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t
write( + PacketType &packet_data, DataScalar ptr) { + EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits::size; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; i++) { + *ptr = PacketWrapper::scalarize(i, packet_data); + ptr += ld; + } +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization enabled This function + * is used to guarantee coalesced and vectorized store whenever possible. + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + */ + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename std::enable_if_t::size != 1 && dt == data_source::global_mem, + void> + write(PacketType &packet_data, DataScalar *ptr) { + ::Eigen::internal::pstoreu(ptr, packet_data); +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled. + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename std::enable_if_t::size == 1 && dt == data_source::global_mem, + void> + write(PacketType &packet_data, DataScalar *ptr) { + *ptr = packet_data; +} + +/*! + * \brief check_boundary: is used to check the edge condition for non-internal blocks. + * + * \tparam is_internal: determines if the block is internal + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) { + return true; +} + +/*! + * \brief check_boundary: specialization of the check_boundary for non-internal blocks. + * + * \param cond: true when the data is in range. Otherwise false + */ +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool cond) { + return cond; +} + +/*! + * \brief BlockProperties is a template class that provides different characteristic of a block of each Tensor processed + * by each workgroup. + * + * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed + * + * \tparam packet_load_: determines if the each element of this tensor block should be loaded in a packet mode + * + * \tparam PacketType: determines the type of packet + * + * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be + * packetType; Otherwise it will be scalar Type + * + * \param elements_per_access determines the size of each element based on OutType + * + * \param is_coalesced_layout determines whether or not the Tensor data in a memory can be access coalesced and + * vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + * contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + * when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \param nc_stride determines the stride of non-contracting dimension to access the next adjustment element within the + * Tensor Block for each workgroup + * + * \param c_stride determines the stride of contracting dimension to access the next adjustment element within the + * Tensor Block for each workgroup + */ +template +struct BlockProperties { + static EIGEN_CONSTEXPR bool packet_load = packet_load_; + typedef typename Eigen::internal::unpacket_traits::type OutScalar; + static EIGEN_CONSTEXPR bool is_rhs = is_rhs_; + typedef std::conditional_t OutType; + static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits::size; + static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs); + static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1); + static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access); +}; + +/*! + * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup. Please see + * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the workgroup, + * work-items + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \param linearLocalThreadId: determines the linearized location of a thread within a work-group + * + * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. It will be > 1 when + * tall/skinny algorithm is used + * + * \param mGroupOffset: determines the logical start position of all thread within a workgroup for the m dimension of + * the flattened tensor. + * + * \param kGroupOffset determines the logical start position of all thread within a workgroup for the k dimension of the + * flattened tensor. It will be > 1 when tall/skinny algorithm is used. + * + * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of a + * flattened tensor. The position determines the distance of each thread within the workgroup from each other + * independent from their global position. + * + * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of a + * flattened tensor. The position determines the distance of each thread within the workgroup from each other + * independent from their global position. + * + * \param mGlobalOffset: determines the logical start position of each thread a thread for the m dimension on a + * flattened tensor + * + * \param nGlobalOffset: determines the logical start position of each thread a thread for the n dimension on a + * flattened tensor + * + * \param kSize : determine the number of the k elements of the flattened Tensor to be processed by each thread for the + * given tensor block. This is !=K dimension of Flattened Tensor when Tall/Skinny matrix is used. + * + * \param is_internal : this will determined if the thread within the work-group computes an internal block of tensor or + * the edge blocks. When it is internal, there is no need to check the boundaries and all the if stantement can be + * resolve by compiler. + */ +template +struct ThreadProperties { + const StorageIndex linearLocalThreadId; + const StorageIndex kGroupId; + const StorageIndex mGroupOffset; + const StorageIndex nGroupOffset; + const StorageIndex kGroupOffset; + const StorageIndex mLocalOffset; + const StorageIndex nLocalOffset; + const StorageIndex mGlobalOffset; + const StorageIndex nGlobalOffset; + StorageIndex kSize; + const bool is_internal; + // this is used to adjust the last block + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties( + const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_, + const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_, + const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_, + StorageIndex kSize_, const bool is_internal_) + : linearLocalThreadId(linearLocalThreadId_), + kGroupId(kGroupId_), + mGroupOffset(mGroupOffset_), + nGroupOffset(nGroupOffset_), + kGroupOffset(kGroupOffset_), + mLocalOffset(mLocalOffset_), + nLocalOffset(nLocalOffset_), + mGlobalOffset(mGlobalOffset_), + nGlobalOffset(nGlobalOffset_), + kSize(kSize_), + is_internal(is_internal_) {} +}; + +/*! + * \brief TensorContractionKernel is a template class that provides Tensor -Tensor contraction operation. + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam LhsScalar: determines the left-hand-side scalar type + * + * \tparam RhsScalar: determines the right-hand-side scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification + (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam LhsMapper determines the tensor contraction mapper type for left-hand-side matrix + * + * \tparam RhsMapper determines the tensor contraction mapper type for right-hand-side matrix + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Properties: determines the Contraction Panel properties + * + * \tparam TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \tparam input_mapper_properties : determine if the input tensors are matrix. If they are matrix, special memory + access is used to guarantee that always the memory access are coalesced. + * + * \tptaram IsFinal : determine if this is the final kernel. If so, the result will be written in a final output. + Otherwise, the result of contraction will be written iin a temporary buffer. This is the case when Tall/Skinny + contraction is used. So in this case, a final reduction step is required to compute final output. + + * \tparam contraction_tp: it is an enum value representing whether the local memory/no local memory implementation of + the algorithm to be used + * + * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param groupSizeM: a logical number determining the number of work-group for m dimension + * + * \param groupSizeN: a logical number determining the number of work-group for n dimension + * + * \param numTiles: determines total number of tiles on the k dimension + * + * \param TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix + */ +template +class TensorContractionKernel { + public: + typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise::PacketSize; + static EIGEN_CONSTEXPR bool is_lhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; + + typedef BlockProperties + LHSBlockProperties; + + typedef BlockProperties + RHSBlockProperties; + + static EIGEN_CONSTEXPR StorageIndex NStride = + contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride; + + typedef cl::sycl::accessor Scratch; + typedef cl::sycl::multi_ptr local_ptr; + typedef OutScalar * /*cl::sycl::multi_ptr*/ private_ptr; + typedef std::conditional_t tile_ptr; + static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local + ? Properties::TileSizeDimM + Properties::BC + : Properties::WorkLoadPerThreadM; + static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local + ? Properties::TileSizeDimN + Properties::BC + : Properties::WorkLoadPerThreadN; + static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + + /** + * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not + * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to + * the kernel as an accessor, the private memory can only allowed to be allocated statically. Since we are abstracting + * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out + * different type of memory needed when local/no_local memory computation is called. + * + * \tparam contraction_type: it is an enum value representing whether the local memory/no local memory implementation + of the algorithm to be used + * \tparam the private memory size + * \param ptr the tile memory pointer type + */ + template + struct MemHolder { + tile_ptr ptr; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {} + }; + /** + * \brief specialization of memHolder class when no local memory kernel is used. + */ + template + struct MemHolder { + OutScalar ptr[MemSize] = {OutScalar{0}}; + }; + /** + * \brief TiledMemory: contains required memory pointer for loading each tile of the TensorContraction panel from + * global memory to local/private memory when local/no_local algorithm used. + * + * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the + * selected contraction_type. + * + * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the + * selected contraction_type. + * + * \param lhs_extract_index: determines the position of each thread on a local memory for lhs input. When private + * memory is used this is set to zero as this is not applicable in case of private memory. + * + * \param rhs_extract_index: determines the position of each thread on a local memory for rhs input. When private + * memory is used this is set to zero as this is not applicable in case of private memory. + * + * \param lhs_scratch_compute : determines the location to load for computation for lhs_local memory. This is the + * same as lhs_scratch_extract for private memory. + * + * \param rhs_scratch_compute : determines the location to load for computation for rhs_local memory. This is the + * same as rhs_scratch_extract for private memory. + */ + struct TiledMemory { + MemHolder lhs_scratch_extract; + MemHolder rhs_scratch_extract; + tile_ptr lhs_scratch_ptr_compute; + tile_ptr rhs_scratch_ptr_compute; + const std::pair lhs_extract_index; + const std::pair rhs_extract_index; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TiledMemory(const ThreadProperties &, local_ptr, + std::enable_if_t * = 0) + : lhs_scratch_extract{}, + rhs_scratch_extract{}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr), + lhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})), + rhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TiledMemory(const ThreadProperties &thread_properties, + local_ptr block_start_ptr, + std::enable_if_t * = 0) + : lhs_scratch_extract{block_start_ptr}, + rhs_scratch_extract{lhs_scratch_extract.ptr + + ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset), + lhs_extract_index( + local_id_extract(thread_properties.linearLocalThreadId)), + rhs_extract_index( + local_id_extract(thread_properties.linearLocalThreadId)) {} + }; + + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex groupSizeM; + const StorageIndex groupSizeN; + const StorageIndex numTiles; + const TripleDim triple_dim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex groupSizeM_, + const StorageIndex groupSizeN_, + const StorageIndex numTiles_, + const TripleDim triple_dim_) + : scratch(scratch_), + lhs(lhs_), + rhs(rhs_), + out_res(out_res_), + groupSizeM(groupSizeM_), + groupSizeN(groupSizeN_), + numTiles(numTiles_), + triple_dim(triple_dim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex groupSizeM_, + const StorageIndex numTiles_, + const TripleDim triple_dim_) + : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { + const StorageIndex linearLocalThreadId = itemID.get_local_id(0); + const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM; + const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM; + const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM; + const StorageIndex tmp = itemID.get_group(0) / groupSizeM; + const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN; + const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN; + const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM; + const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN; + const StorageIndex mLocalOffset = PacketSize * mLocalThreadId; + const StorageIndex nLocalOffset = NStride * nLocalThreadId; + const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset; + const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset; + + const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK; + StorageIndex kGroupOffset = kGroupId * kSizePerWG; + const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM && + triple_dim.N - nGroupOffset >= Properties::TileSizeDimN && + triple_dim.K - kGroupOffset >= kSizePerWG; + // this is used to adjust the last block + StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset); + // This is used to find out the lats K offset so that kGroupOffset -kSize can compute the coffset for loading to + // tile + kGroupOffset += kSize; + + auto thread_properties = + ThreadProperties(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset, + mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal); + + auto out_ptr = out_res + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N); + + (thread_properties.is_internal) ? compute_panel(itemID, thread_properties, out_ptr) + : compute_panel(itemID, thread_properties, out_ptr); + } + // The compute block computes the contraction operation private block for each thread and store the resutl in the + // privateRes memory of Each computation the compute block function is independent of local and no local concepts as + // it only compute the block on each thread's private memory space + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr, + PacketReturnType *privateRes) const { + StorageIndex idx = 0; + EIGEN_CONSTEXPR StorageIndex lhs_stride = + contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) { + auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)}; + StorageIndex lhs_index = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType lhsPack{}; + Eigen::TensorSycl::internal::PacketWrapper::set_packet(lhsPack, + lhs_block_ptr + lhs_index); + privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]); + + lhs_index += lhs_stride; + idx++; + } + } + } + // The store function write the computed contraction operation in the private memory of each thread to the global + // memory. The store function is independent of local and no local concepts s that it can be abstract out in the base + // class. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes, + StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) const { + auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC { + return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N); + }; + // when local memory is not used M and N are both accessed in a coalesced way. However, when local memory is + // available the k*N is transposed in the local to N*K therefore, each blocks operates on blockId* + // WorkLoadPerThreadN slice of N + EIGEN_CONSTEXPR StorageIndex GlobalNStride = + contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) { + // output leading dimension + StorageIndex outputLD = 0; + // When local memory is used the PrivateNstride is always 1 because the coalesced access on N is loaded into Local + // memory and extracting from local to global is the same as no transposed version. However, when local memory is + // not used and RHS is transposed we packetize the load for RHS. + EIGEN_UNROLL_LOOP + for (StorageIndex nId = 0; nId < PrivateNStride; nId++) { + StorageIndex globalRow = mGlobalOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType privetOut = privateRes[wLPTM]; + if (check_boundary(chk_bound(globalRow, nId))) { + // Store the final results in C. The C matrix has always M as a first StorageIndex and N as a second + // StorageIndex Therefore it is always coalesced layout + write(privetOut, out_ptr + outputLD + globalRow); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex mId = 0; mId < PacketSize; mId++) { + StorageIndex mOffset = globalRow + mId; + if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) { + out_ptr[mOffset + outputLD] = + Eigen::TensorSycl::internal::PacketWrapper::scalarize(mId, privetOut); + } + } + } + globalRow += (PacketSize * Properties::LocalThreadSizeM); + } + outputLD += triple_dim.M; + privateRes += Properties::WorkLoadPerThreadM / PacketSize; + } + out_ptr += (GlobalNStride * outputLD); + + nGlobalOffset += (PrivateNStride * GlobalNStride); + } + } + // when no local memory is used the following extract_block will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t extract_block( + const Input &inpt, PrivateReg private_ptr, const std::pair &, + const StorageIndex &ncOffset, const StorageIndex cOffset) const { + EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = + InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM; + EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = + InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM; + const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; + + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; + StorageIndex cIndex = cOffset; + + EIGEN_UNROLL_LOOP + for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) { + StorageIndex ncIndex = ncOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) { + if (check_boundary(chk_bound(cIndex, ncIndex))) { + auto val = + read(inpt, ncIndex, cIndex, ld); + + write(val, private_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (ncInd < NC && cInd < triple_dim.K) + ? read( + inpt, ncInd, cInd, ld) + : OutScalar(0); + write( + val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC)); + } + } + + // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So + // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread. + ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1) + ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC + : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC); + private_ptr += InputBlockProperties::nc_stride; + } + // the previous for loop ( private_ptr += (ncId * nc_stride)) has already moved ptr with one WorkLoadPerThreadNC + private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC; + cIndex += InputBlockProperties::c_stride; + } + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair local_id_extract( + const StorageIndex &linearLocalThreadId) { + const StorageIndex localThreadNC = + (InputBlockProperties::is_coalesced_layout) + ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride) + : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride); + const StorageIndex localThreadC = + (InputBlockProperties::is_coalesced_layout) + ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride) + : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride); + return std::pair(localThreadNC, localThreadC); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t sync_mem( + const cl::sycl::nd_item<1> &, bool &db_offset) noexcept { + db_offset = !db_offset; + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t sync_mem( + const cl::sycl::nd_item<1> &itemID, bool &) noexcept { + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t sync_mem( + const cl::sycl::nd_item<1> &, bool &) noexcept { + return; + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t + sync_thread(const cl::sycl::nd_item<1> & +#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION + itemID +#endif + ) noexcept { +#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION + itemID.barrier(cl::sycl::access::fence_spacce::local_space); +#else + return; +#endif + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t + sync_thread(const cl::sycl::nd_item<1> &itemID) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t sync_thread(const cl::sycl::nd_item<1> &) { + return; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID, + ThreadProperties &thread_properties, + TiledMemory &tiled_input_block, + PacketReturnType *privateRes, + bool &db_offset) const { + // Tiling the Rhs block from global to local memory + extract_block( + rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR), + tiled_input_block.rhs_extract_index, + contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset, + thread_properties.kGroupOffset - thread_properties.kSize); + + sync_thread(itemID); + + // Tiling the Lhs block from global to local memory + extract_block( + lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK), + tiled_input_block.lhs_extract_index, + contraction_tp == contraction_type::local ? thread_properties.mGroupOffset : thread_properties.mGlobalOffset, + thread_properties.kGroupOffset - thread_properties.kSize); + + // itemID.barrier(cl::sycl::access::fence_space::local_space); + sync_thread(itemID); + // switch to compute mede + StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK); + StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR); + // Loop over the values of a single tile + for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) { + compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset, + tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes); + lhs_offset += LSDL; + rhs_offset += LSDR; + } + // computing the K index for the next tile + thread_properties.kSize -= Properties::TileSizeDimK; + sync_mem(itemID, db_offset); + } + + // when local memory is available the following compute_panel will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID, + ThreadProperties &thread_properties, + OutPtr out_ptr) const { + auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()}; + // Allocate register space + PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = { + PacketReturnType{0}}; + bool db_offset = 0; + + while (thread_properties.kSize >= Properties::TileSizeDimK) { + compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + if (thread_properties.kSize > 0) { + compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + + // Storing the final results in the output + store(1) : RHSBlockProperties::nc_stride>( + out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset, + thread_properties.nGlobalOffset); + } + // When local memory is available the following extract_block will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t extract_block( + const Input &inpt, Local local_ptr, const std::pair &local_index, + const StorageIndex &ncOffset, const StorageIndex cOffset) const { + EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = + InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM; + EIGEN_CONSTEXPR StorageIndex LoadPerThread = + InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs; + EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL; + static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) && + (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)), + " LocalOffset must be divisible by stride"); + const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; + StorageIndex localThreadNC = local_index.first; + StorageIndex localThreadC = local_index.second; + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + EIGEN_UNROLL_LOOP + for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) { + const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC); + const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC); + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; + if (check_boundary(chk_bound(CIndex, NCIndex))) { + auto val = + read(inpt, NCIndex, CIndex, ld); + write( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::c_stride * localThreadC * LSD)); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (nCInd < NC && cInd < triple_dim.K) + ? read( + inpt, nCInd, cInd, ld) + : OutScalar(0); + + write( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::c_stride * localThreadC + + (InputBlockProperties::is_coalesced_layout ? 0 : i)) * + LSD)); + } + } + localThreadNC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride); + localThreadC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride); + } + } +}; + +#ifndef EIGEN_SYCL_DISABLE_GEMV + +/*! + * \brief GeneralVectorTensor is a template class that provides Tensor -vector contraction operation, which is a special + * case of Tensor Tensor contraction. + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification + * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs) + * + * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs) + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Properties: determines the Contraction Panel properties + * + * \tparam KFactor: determines the number of elements in K dimension in a Tile + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector + * + * \tparam IsFinal: determine if this is the final kernel. If so, the result will be written in a final output. + * Otherwise, the result of contraction will be written iin a temporary buffer. + * + * \param scratch: determines the local memory containing the vector block for each work-group + * + * \param vec: determines the vector input (tensor mapper) + * + * \param mat: determines the tensor input (tensor mapper) + * + * \param out_res: determines the output vector containing the contraction result + * + * \param nonContractGroupSize: a logical number determining the number of work-group for non-contracting dimension + * + * \param nonContractDim: determines the size of non contracting dimension for the flattened tensor + * + * \param contractDim: determines the size of non contracting dimension for the flattened tensor + * + */ +template +struct GeneralVectorTensor { + typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise::PacketSize; + typedef cl::sycl::accessor Scratch; + + static EIGEN_CONSTEXPR StorageIndex OutScratchOffset = + KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + + // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make + // sure that the !^ is true When RHS is a vector, we pass true and true to make sure that the !^ is true. + typedef BlockProperties + VecBlockProperties; + + Scratch scratch; + const VectorMapper vec; + const TensorMapper mat; + OutAccessor out_res; + const StorageIndex nonContractGroupSize; + const StorageIndex nonContractDim; + const StorageIndex contractDim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_, + const TensorMapper mat_, OutAccessor out_res_, + const StorageIndex nonContractGroupSize_, + const StorageIndex nonContractDim_, + const StorageIndex contractDim_) + : scratch(scratch_), + vec(vec_), + mat(mat_), + out_res(out_res_), + nonContractGroupSize(nonContractGroupSize_), + nonContractDim(nonContractDim_), + contractDim(contractDim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { + auto scratch_ptr = scratch.get_pointer(); + const StorageIndex linearLocalThreadId = itemID.get_local_id(0); + StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC + : linearLocalThreadId % Properties::LocalThreadSizeNC; + StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC + : linearLocalThreadId / Properties::LocalThreadSizeNC; + const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize; + const StorageIndex nonContractGroupId = + is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize; + const StorageIndex contractGroupId = + is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize; + auto out_ptr = out_res + (IsFinal ? 0 : contractGroupId * nonContractDim); + + const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC; + const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC; + auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId; + const StorageIndex globalContractDimOffset = contractGroupOffset + contractId; + auto local_output = scratch_ptr + OutScratchOffset; + const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC && + contractDim - contractGroupOffset >= Properties::TileSizeDimC; + is_internal + ? compute_panel(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex) + : compute_panel(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel( + const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output, + OutPtr out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + OutScalar *scratch_ptr, const StorageIndex contractGroupOffset, +#endif + const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim, + StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId, + StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) { + OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)}; + // Reading the vector +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId; + extract_block(vec, scratch_ptr, linearLocalThreadId, + vectorOffset, contractDim); + + itemID.barrier(cl::sycl::access::fence_space::local_space); + auto in_scratch_ptr = scratch_ptr + contractId; +#endif + + StorageIndex privateOffsetC = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) { + StorageIndex privateOffsetNC = 0; + bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim); +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + auto vecScalar = *in_scratch_ptr; +#else + auto vecScalar = (check_boundary(contract_conds)) + ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC, + is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0)) + : OutScalar(0); +#endif + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + auto matScalar = (check_boundary( + contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim))) + ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC + : globalNonContractDimOffset + privateOffsetNC, + is_lhs_vec ? globalNonContractDimOffset + privateOffsetNC + : globalContractDimOffset + privateOffsetC) + : OutScalar(0); + + outScalar[j] = ::Eigen::internal::pmadd(matScalar, vecScalar, outScalar[j]); + privateOffsetNC += Properties::LocalThreadSizeNC; + } + privateOffsetC += Properties::LocalThreadSizeC; +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + in_scratch_ptr += Properties::LocalThreadSizeC; +#endif + } + + auto out_scratch_ptr = local_output + outScratchIndex; + // Each block of 16*16 element in shared memory should reduce to 16*1 + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + *out_scratch_ptr = outScalar[j]; + + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + } + if (is_lhs_vec) { + nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC; + contractId = linearLocalThreadId / Properties::LocalThreadSizeNC; + outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + } + + out_scratch_ptr = local_output + outScratchIndex; + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + EIGEN_UNROLL_LOOP + for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (contractId < offset) { + StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset); + *out_scratch_ptr += out_scratch_ptr[myNeigbourId]; + } + } + // moving to next 16 by 16 block + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + } + + if (contractId == 0) { + out_scratch_ptr = local_output + nonContractId; + StorageIndex global_final_offset = nonContractGroupOffset + nonContractId; + out_ptr += global_final_offset; + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + if (check_boundary(global_final_offset < nonContractDim)) { + auto res = *out_scratch_ptr; + + *out_ptr = res; + out_ptr += Properties::LocalThreadSizeNC; + } + // moving to next 16 by 16 block to ge the next 16 reduced elements + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC; + } + } + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr, + const StorageIndex &linearLocalThreadId, + const StorageIndex &cOffset, const StorageIndex &C) { + local_ptr += InputBlockProperties::c_stride * linearLocalThreadId; + StorageIndex cIndex = cOffset; + for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) { + if (check_boundary(cIndex + InputBlockProperties::c_stride - 1 < C)) { + auto val = read(inpt, StorageIndex(0), + cIndex, StorageIndex(1)); + write(val, local_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + OutScalar val = + (cIndex + i < C) + ? read( + inpt, StorageIndex(0), cIndex + i, StorageIndex(1)) + : OutScalar(0); + write(val, local_ptr + i); + } + } + local_ptr += InputBlockProperties::c_stride * GroupSize; + cIndex += InputBlockProperties::c_stride * GroupSize; + } + } +}; +#endif + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + +/*! + * \brief GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contraction + * operation, when all the dimensions are contracting dimensions. This Kernel reduces two tensors to an scalar + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam LhsScalar: determines the left-hand-side scalar type + * + * \tparam RhsScalar: determines the right-hand-side scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification + * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix + * + * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param rng: determines the total input data size + */ +template +struct GeneralScalarContraction { + typedef cl::sycl::accessor Scratch; + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex rng; + + EIGEN_DEVICE_FUNC GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, + OutAccessor out_res_, const StorageIndex rng_) + : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {} + + EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const { + auto out_ptr = out_res; + OutScalar *scratch_ptr = scratch.get_pointer(); + + StorageIndex globalid = itemID.get_global_id(0); + StorageIndex localid = itemID.get_local_id(0); + OutScalar accumulator = OutScalar(0); + for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) { + accumulator = Eigen::internal::pmadd(lhs(0, i), rhs(i, 0), accumulator); + } + auto out_scratch_ptr = scratch_ptr + localid; + *out_scratch_ptr = accumulator; + for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]); + } + } + if (localid == 0) { + out_ptr[itemID.get_group(0)] = accumulator; + } + } +}; +#endif + +} // namespace internal +} // namespace TensorSycl + +template +struct TensorEvaluator, + Eigen::SyclDevice> + : public TensorContractionEvaluatorBase, Eigen::SyclDevice>> { + static_assert(std::is_same::value, + "SYCL tensor contraction does not support output kernels."); + + typedef Eigen::SyclDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + typedef TensorContractionOp XprType; + typedef std::remove_const_t Scalar; + typedef typename XprType::Index StorageIndex; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Base::Storage Storage; + typedef typename Base::EvaluatorPointerType EvaluatorPointerType; + struct TripleDim { + const StorageIndex M; + const StorageIndex N; + const StorageIndex K; + TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {} + }; + enum { + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + }; + + static constexpr int Layout = TensorEvaluator::Layout; + static constexpr int LDims = Base::LDims; + static constexpr int RDims = Base::RDims; + static constexpr int ContractDims = Base::ContractDims; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static constexpr int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + typedef std::remove_const_t LhsScalar; + typedef std::remove_const_t RhsScalar; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + template + struct input_mapper_propertis { + static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_matrix = + (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered); + }; + + TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {} + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (!data) { + this->m_result = this->m_device.get( + static_cast(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar)))); + data = this->m_result; + } + evalToSycl(data); + return (this->m_result != NULL); + } + const Eigen::SyclDevice &device() const { return this->m_device; } + void evalToSycl(typename Base::EvaluatorPointerType buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } + } + + template + void evalTyped(typename Base::EvaluatorPointerType buffer) const { + const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size}; + typedef internal::TensorContractionInputMapper< + LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t, + PacketType::size, lhs_inner_dim_contiguous, false, Unaligned, MakePointer> + LhsMapper; + + typedef internal::TensorContractionInputMapper::size, rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned, MakePointer> + RhsMapper; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + if (triple_dim.M == 1 && triple_dim.N == 1) { + launchSC(buffer, lhs, rhs, triple_dim.K); + } else +#endif +#ifndef EIGEN_SYCL_DISABLE_GEMV + if (triple_dim.M != 1 && triple_dim.N == 1) { + LaunchVT(buffer, rhs, lhs, triple_dim.M, triple_dim.K); + } else if (triple_dim.M == 1 && triple_dim.N != 1) { + LaunchVT(buffer, lhs, rhs, triple_dim.N, triple_dim.K); + } else // This is equivalent of if (m!=1 && n!=1) +#endif + { + typedef input_mapper_propertis + inpt_mapper_properties; +#ifndef EIGEN_SYCL_DISABLE_SKINNY + bool skinny = false; + auto platform_name = this->device().getPlatformName(); + // This is based on empirical calculation for AMD r9-nano and Fiji + if (platform_name.find("AMD") == 0) { + skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) && + ((triple_dim.M < 1024 && triple_dim.N < 1024) || + (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K))); + } else { + skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) || + ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) || + ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100)); + } + if (skinny) + adjustTT(buffer, lhs, rhs, triple_dim); + else +#endif // EIGEN_SYCL_DISABLE_SKINNY + adjustTT(buffer, lhs, rhs, triple_dim); + } + } + + template + void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + if (device().has_local_memory()) { + typedef TensorSycl::internal::TTPanelSize PanelParameters; + launchTT( + buffer, lhs, rhs, triple_dim); + } +#endif +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF + if (!(device().has_local_memory())) { + typedef TensorSycl::internal::TTPanelSize PanelParameters; + launchTT( + buffer, lhs, rhs, triple_dim); + } +#endif + } + + template + void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { + const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM); + const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN); + const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM; + const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN; + + const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK); + StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK; + StorageIndex groupSizeK = + skinny + ? std::max(std::min(totalTilesK, + (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) / + (groupSizeM * groupSizeN)), + StorageIndex(1)) + : StorageIndex(1); + + const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK; + + const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK; + + const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + const StorageIndex globalRange = totalGroupSize * localRange; + + const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local) + ? ((Properties::DoubleBuffer + 1) * + (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) + + ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) * + (Properties::TileSizeDimN + Properties::BC)) + : StorageIndex(1); + + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (groupSizeK == 1) { + typedef TensorSycl::internal::TensorContractionKernel + ContractKernelName; + device() + .template binary_kernel_launcher( + lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim) + .wait(); + } else { + typedef TensorSycl::internal::TensorContractionKernel + ContractKernelName; + CoeffReturnType *temp_pointer = static_cast( + device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + + device() + .template binary_kernel_launcher( + lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, + triple_dim) + .wait(); + + typedef Eigen::internal::SumReducer Op; + auto op = Op(); + typedef TensorSycl::internal::SecondStepPartialReduction + ReductionKernel; + + device() + .template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex( + Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))), + cl::sycl::range<1>(localRange)), + StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK) + .wait(); + device().deallocate_temp(temp_pointer); + } + } + +#ifndef EIGEN_SYCL_DISABLE_GEMV + template + void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat, + StorageIndex NC, StorageIndex C) const { + const StorageIndex nonContractDim = NC; + EIGEN_CONSTEXPR StorageIndex NCFactor = 1; + EIGEN_CONSTEXPR StorageIndex CFactor = 1; + EIGEN_CONSTEXPR StorageIndex NCWindow = 16; + typedef Eigen::TensorSycl::internal::TVPanelSize + Properties; + const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC); + const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC); + const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC); + const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC); + const StorageIndex globalRange = + (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC)); + const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC; + const StorageIndex scratchSize = + (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (cNumGroups > 1) { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor + ContractKernelName; + CoeffReturnType *temp_pointer = + static_cast(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + + device() + .template binary_kernel_launcher( + vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C) + .wait(); + + typedef Eigen::internal::SumReducer Op; + typedef TensorSycl::internal::SecondStepPartialReduction + ReductionKernel; + + device() + .template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>( + cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)), + cl::sycl::range<1>(localRange)), + StorageIndex(1), Op(), nonContractDim, cNumGroups) + .wait(); + device().deallocate_temp(temp_pointer); + } else { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor + ContractKernelName; + device() + .template binary_kernel_launcher( + vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C) + .wait(); + } + } +#endif + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + template + EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + StorageIndex K) const { + EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & + (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; + + // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread + // reduces at least 512 elementss individually, we get better performance. + const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1); + const StorageIndex global_range = num_work_group * local_range; + + typedef Eigen::TensorSycl::internal::GeneralScalarContraction< + CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false> + ContractKernelName; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + if (num_work_group > 1) { + CoeffReturnType *temp_pointer = + static_cast(device().allocate_temp(num_work_group * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + device() + .template binary_kernel_launcher(lhs, rhs, tmp_global_accessor, + thread_range, local_range, K) + .wait(); + typedef Eigen::internal::SumReducer Op; + typedef TensorSycl::internal::SecondStepFullReducer + GenericRKernel; + device() + .template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, + Op()) + .wait(); + device().deallocate_temp(temp_pointer); + } else { + device() + .template binary_kernel_launcher(lhs, rhs, buffer, thread_range, + local_range, K) + .wait(); + } + } +#endif + + EIGEN_STRONG_INLINE void cleanup() { + this->m_leftImpl.cleanup(); + this->m_rightImpl.cleanup(); + + if (this->m_result) { + this->m_device.deallocate_temp(this->m_result); + this->m_result = NULL; + } + } +}; +} // namespace Eigen +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h new file mode 100644 index 00000000000..288d79f1f2a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -0,0 +1,1552 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H + +// evaluator for thread pool device +#ifdef EIGEN_USE_THREADS + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +template +struct TensorEvaluator, + ThreadPoolDevice> + : public TensorContractionEvaluatorBase, ThreadPoolDevice>> { + typedef ThreadPoolDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef std::remove_const_t Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + static constexpr int Layout = TensorEvaluator::Layout; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef std::conditional_t(Layout) == static_cast(ColMajor), LeftArgType, RightArgType> + EvalLeftArgType; + typedef std::conditional_t(Layout) == static_cast(ColMajor), RightArgType, LeftArgType> + EvalRightArgType; + + static constexpr int LDims = + internal::array_size::Dimensions>::value; + static constexpr int RDims = + internal::array_size::Dimensions>::value; + static constexpr int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static constexpr int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef std::remove_const_t LhsScalar; + typedef std::remove_const_t RhsScalar; + typedef typename internal::gebp_traits Traits; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + template + void evalProduct(Scalar* buffer) const { + evalProductImpl(buffer, NoCallback()); + } + + template + void evalProductAsync(Scalar* buffer, EvalToCallback done) const { + evalProductImpl(buffer, std::move(done)); + } + + template + void evalProductImpl(Scalar* buffer, DoneCallback done) const { + // This function computes a lot of heuristics in multiple steps, and it + // also has multiple exit points. To keep it sane, readable and all in one + // place, sync/async execution decision is made at runtime at the very end. + // + // (1) In sync mode we allocate Context on the stack, submit computations + // to the device thread pool, and block on a barrier until it is + // completed. + // + // (2) In async mode we allocate Context on the heap, and after all tasks + // are finished, we call provided the done callback, and delete a + // context from the heap. + // + // (*) EvalParallelContext & EvalShardedByInnerDimContext owns all the state + // and temporary buffers, required for executing the tensor contraction. + // They are responsible for cleaning it up after contraction is done. + static const bool IsEvalInSyncMode = std::is_same::value; + + const Index m = this->m_i_size; + const Index n = this->m_j_size; + const Index k = this->m_k_size; + if (m == 0 || n == 0 || k == 0) return; + + // Compute a set of algorithm parameters: + // - kernel block sizes (bm, bn, bk) + // - task grain sizes (number of kernels executed per task: gm, gn) + // - number of threads + // - sharding by row/column + // - parallel packing or first lhs then rhs + // and some derived parameters: + // - number of tasks (nm, nn, nk) + // - number of kernels (nm0, nn0) + // Unfortunately, all these parameters are tightly interdependent. + // So in some cases we first compute approximate values, then compute other + // values based on these approximations and then refine the approximations. + + // There are lots of heuristics here. There is some reasoning behind them, + // but ultimately they are just tuned on contraction benchmarks for + // different input configurations, thread counts and instruction sets. + // So feel free to question any of them. + + // Compute whether we want to shard by row or by column. + // This is a first approximation, it will be refined later. Since we don't + // know number of threads yet we use 2, because what's we are most + // interested in at this point is whether it makes sense to use + // parallelization at all or not. + bool shard_by_col = shardByCol(m, n, 2); + + // First approximation of kernel blocking sizes. + // Again, we don't know number of threads yet, so we use 2. + Index bm, bn, bk; + if (shard_by_col) { + internal::TensorContractionBlocking blocking(k, m, n, + 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking blocking(k, m, n, + 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Compute optimal number of threads. + // Note: we use bk instead of k here because we are interested in amount of + // _parallelizable_ computations, and computations are not parallelizable + // across k dimension. + const TensorOpCost cost = contractionCost(m, n, bm, bn, bk, shard_by_col, false); + int num_threads = + TensorCostModel::numThreads(static_cast(n) * m, cost, this->m_device.numThreads()); + int num_threads_by_k = numThreadsInnerDim(m, n, k); + if (shardByInnerDim(m, n, k, num_threads, num_threads_by_k)) { + // We are in the scenario where it is more effective to shard by the + // inner dimension. + if (IsEvalInSyncMode) { + EvalShardedByInnerDimContext ctx(this, num_threads_by_k, buffer, m, n, k, std::move(done)); + ctx.template run(); + } else { + auto* ctx = + new EvalShardedByInnerDimContext(this, num_threads_by_k, buffer, m, n, k, std::move(done)); + ctx->template runAsync(); + } + + return; + } + + // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost + // model is not tuned. Remove this when the cost model is tuned. + if (n == 1) num_threads = 1; + + if (num_threads == 1) { + TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Unaligned, (buffer)); + if (!IsEvalInSyncMode) done(); + return; + } + + // Now that we know number of threads, recalculate sharding and blocking. + shard_by_col = shardByCol(m, n, num_threads); + if (shard_by_col) { + internal::TensorContractionBlocking blocking( + k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking blocking( + k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Number of kernels for each dimension. + Index nm0 = numext::div_ceil(m, bm); + Index nn0 = numext::div_ceil(n, bn); + Index nk = numext::div_ceil(k, bk); + + // Calculate task grain size (number of kernels executed per task). + // This task size coarsening serves two purposes: + // 1. It reduces per-task overheads including synchronization overheads. + // 2. It allows to use caches better (reuse the same packed rhs in several + // consecutive kernels). + Index gm = 1; + Index gn = 1; + // If we are sharding by column, then we prefer to reduce rows first. + if (shard_by_col) { + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + } else { + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + } + // Number of tasks in each dimension. + Index nm = numext::div_ceil(nm0, gm); + Index nn = numext::div_ceil(nn0, gn); + + // If there is enough concurrency in the sharding dimension, we choose not + // to paralellize by the other dimension, and execute all kernels in sync + // mode. This reduces parallelism from the nm x nn down to nn + // (shard_by_col==true) or nm (shard_by_col==false). + const Index sharding_dim_tasks = shard_by_col ? nn : nm; + const int num_worker_threads = this->m_device.numThreadsInPool(); + + // With small number of threads we want to make sure that we do not reduce + // parallelism too much. With large number of threads we trade maximum + // parallelism for better memory locality. + const float oversharding_factor = num_worker_threads <= 4 ? 8.0 + : num_worker_threads <= 8 ? 4.0 + : num_worker_threads <= 16 ? 2.0 + : num_worker_threads <= 32 ? 1.0 + : num_worker_threads <= 64 ? 0.8 + : /* num_worker_threads > 64 */ 0.6; + + const bool parallelize_by_sharding_dim_only = sharding_dim_tasks >= oversharding_factor * num_worker_threads; + + // Last by not least, decide whether we want to issue both lhs and rhs + // packing in parallel; or issue lhs packing first, and then issue rhs + // packing when lhs packing completes (for !shard_by_col lhs and rhs are + // swapped). Parallel packing allows more parallelism (for both packing and + // kernels), while sequential packing provides better locality (once + // a thread finishes rhs packing it proceed to kernels with that rhs). + // First, we are interested in parallel packing if there are few tasks. + bool parallel_pack = num_threads >= nm * nn; + // Also do parallel packing if all data fits into L2$. + if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <= l2CacheSize() * num_threads) + parallel_pack = true; + // But don't do it if we will use each rhs only once. Locality seems to be + // more important in this case. + if ((shard_by_col ? nm : nn) == 1) parallel_pack = false; + // Also don't get in the way of parallelize_by_sharding_dim_only + // optimization. + if (parallelize_by_sharding_dim_only) parallel_pack = false; + + // TODO(ezhulnev): With if contexpr we don't need SyncEvalParallelContext. + if (IsEvalInSyncMode) { +#define CONTEXT_ARGS \ + (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, shard_by_col, parallel_pack, \ + parallelize_by_sharding_dim_only, NoCallback()) \ + .run() + TENSOR_CONTRACTION_DISPATCH(SyncEvalParallelContext, Alignment, CONTEXT_ARGS); +#undef CONTEXT_ARGS + + } else { +#define CONTEXT_ARGS \ + (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, shard_by_col, parallel_pack, \ + parallelize_by_sharding_dim_only, std::move(done)) + TENSOR_CONTRACTION_ASYNC_DISPATCH(EvalParallelContext, DoneCallback, Alignment, CONTEXT_ARGS, run()); +#undef CONTEXT_ARGS + } + } + + // ------------------------------------------------------------------------ // + + // Dummy struct to represent an empty DoneCallback. + + struct NoCallback { + void operator()() { eigen_assert(false && "NoCallback should never be called"); } + }; + + // ------------------------------------------------------------------------ // + + template + class EvalParallelNotification; + + // Synchronous evaluation notification that blocks caller thread in Wait(). + template + class EvalParallelNotification { + public: + EvalParallelNotification(Context*, NoCallback) {} + void Notify() { done_.Notify(); } + void Wait() { done_.Wait(); } + + private: + Eigen::Notification done_; + }; + + // Asynchronous evaluation notification that does not block in Wait(). + template + class EvalParallelNotification { + public: + EvalParallelNotification(Context* ctx, DoneCallback done) : ctx_(ctx), done_(std::move(done)) {} + + void Notify() { + // Make a copy of done callback, because it will be destructed when we + // will delete context in the next line (EvalParallelNotification is a + // data member of EvalParallelContext class). + DoneCallback done_copy = std::move(done_); + + // Delete parallel evaluation context. + delete ctx_; + + // Now safely call the done callback. + done_copy(); + } + + void Wait() {} + + private: + Context* ctx_; + DoneCallback done_; + }; + + // Context orchestrates sync/async parallel contraction evaluation. When it is + // executed in asynchronous mode, it owns all the shared state that might be + // accessible by block packing and kernel tasks. + + template + class EvalParallelContext { + public: + typedef internal::TensorContractionInputMapper::size, + lhs_inner_dim_contiguous, false, Unaligned> + LhsMapper; + typedef internal::TensorContractionInputMapper::size, + rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> + RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + typedef internal::TensorContractionKernel + TensorContractionKernel; + + typedef typename TensorContractionKernel::LhsBlock LhsBlock; + typedef typename TensorContractionKernel::RhsBlock RhsBlock; + typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; + + EvalParallelContext(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, + Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, Index gn, Index nm0, Index nn0, + bool shard_by_col, bool parallel_pack, bool parallelize_by_sharding_dim_only, DoneCallback done) + : created_by_thread_id_(std::this_thread::get_id()), + done_(this, std::move(done)), + device_(self->m_device), + lhs_(self->m_leftImpl, self->m_left_nocontract_strides, self->m_i_strides, self->m_left_contracting_strides, + self->m_k_strides), + rhs_(self->m_rightImpl, self->m_right_nocontract_strides, self->m_j_strides, + self->m_right_contracting_strides, self->m_k_strides), + buffer_(buffer), + output_(buffer, tm), + output_kernel_(self->m_output_kernel), + tensor_contraction_params_(self->m_tensor_contraction_params), + num_threads_(num_threads), + shard_by_col_(shard_by_col), + parallel_pack_(parallel_pack), + parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only), + m_(tm), + n_(tn), + k_(tk), + bm_(bm), + bn_(bn), + bk_(bk), + nm_(nm), + nn_(nn), + nk_(nk), + gm_(gm), + gn_(gn), + nm0_(nm0), + nn0_(nn0), + kernel_(m_, k_, n_, bm_, bk_, bn_), + num_thread_local_allocations_(0), + // We reserve 2X more capacity for a thread local values, than the + // number of threads in the pool to efficiently handle task stealing + // by threads that are not managed by the pool. + thread_local_capacity(2 * (parallelize_by_sharding_dim_only_ ? device_.numThreadsInPool() : 0)), + // We will use only one of the Lhs/Rhs thread local storage depending + // on the shard_by_col value and we parallelize by sharding dim ONLY. + lhs_thread_local_blocks_(shard_by_col_ ? 0 : thread_local_capacity, {*this}, {*this}), + rhs_thread_local_blocks_(shard_by_col_ ? thread_local_capacity : 0, {*this}, {*this}) { + // These two options are mutually exclusive. + eigen_assert(!(parallel_pack && parallelize_by_sharding_dim_only)); + + for (Index x = 0; x < P; x++) { + // Normal number of notifications for k slice switch is + // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only + // nm_ + nn_ notifications, because they will not receive notifications + // from preceding kernels. + state_switch_[x] = + x == 0 ? 1 : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) + (x == P - 1 ? nm_ * nn_ : 0); + state_packing_ready_[x] = parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_); + state_kernel_[x] = new std::atomic*[nm_]; + for (Index m = 0; m < nm_; m++) { + state_kernel_[x][m] = new std::atomic[nn_]; + // Kernels generally receive 3 notifications (previous kernel + 2 + // packing), but the first slice won't get notifications from previous + // kernels. + for (Index n = 0; n < nn_; n++) + state_kernel_[x][m][n].store((x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1), std::memory_order_relaxed); + } + } + + // Allocate memory for packed rhs/lhs matrices. + packed_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/nm0_, // + /*num_rhs=*/nn0_, // + /*num_slices=*/std::min(nk_, P - 1), // + packed_lhs_, packed_rhs_); + + if (parallelize_by_sharding_dim_only_) { + const int num_worker_threads = device_.numThreadsInPool(); + + if (shard_by_col) { + can_use_thread_local_packed_ = new std::atomic[nn_]; + for (int i = 0; i < nn_; ++i) can_use_thread_local_packed_[i].store(true, std::memory_order_relaxed); + + Index num_blocks = num_worker_threads * gn_; + thread_local_pre_alocated_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/0, // + /*num_rhs=*/num_blocks, // + /*num_slices=*/1, // + /*lhs_blocks=*/nullptr, &rhs_thread_local_pre_allocated_); + + } else { + can_use_thread_local_packed_ = new std::atomic[nm_]; + for (int i = 0; i < nm_; ++i) can_use_thread_local_packed_[i].store(true, std::memory_order_relaxed); + + Index num_blocks = num_worker_threads * gm_; + thread_local_pre_alocated_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/num_blocks, // + /*num_rhs=*/0, // + /*num_slices=*/1, &lhs_thread_local_pre_allocated_, // + /*rhs_blocks=*/nullptr); + } + } + } + + ~EvalParallelContext() { + for (Index x = 0; x < P; x++) { + for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m]; + delete[] state_kernel_[x]; + } + kernel_.deallocate(device_, packed_mem_); + if (parallelize_by_sharding_dim_only_) { + kernel_.deallocate(device_, thread_local_pre_alocated_mem_); + delete[] can_use_thread_local_packed_; + } + } + + void run() { + // Kick off packing of the first slice. + signal_switch(0, 1); + + // Wait for overall completion. + // + // If parallel evaluation is executed in async mode, this is a no-op, and + // Wait() will return immediately. In synchronous mode it will block the + // caller thread until it will receive notification from last task. + // + // In async mode, last task when completed will call done callback from + // the same thread, and will delete this context. + // + // TODO(dvyukov): This wait can lead to deadlock if contraction is + // evaluated in synchronous mode. If nthreads contractions are + // concurrently submitted from worker threads, this wait will block all + // worker threads and the system will deadlock. + done_.Wait(); + } + + private: + std::thread::id created_by_thread_id_; + + // This notification is specialized on the type of DoneCallback and can be + // blocking or non-blocking. + EvalParallelNotification done_; + + const Device& device_; + LhsMapper lhs_; + RhsMapper rhs_; + Scalar* const buffer_; + OutputMapper output_; + OutputKernelType output_kernel_; + TensorContractionParams tensor_contraction_params_; + const int num_threads_; + const bool shard_by_col_; + const bool parallel_pack_; + const bool parallelize_by_sharding_dim_only_; + // Matrix sizes. + const Index m_; + const Index n_; + const Index k_; + // Block sizes. + const Index bm_; + const Index bn_; + const Index bk_; + // Number of tasks. + const Index nm_; + const Index nn_; + const Index nk_; + // Task grain sizes (number of kernels executed per task). + const Index gm_; + const Index gn_; + // Number of blocks (this is different from ni_/nn_ because of task size + // coarsening). + const Index nm0_; + const Index nn0_; + // Tensor contraction kernel. + TensorContractionKernel kernel_; + + // Parallelization strategy. + // + // Blocks related to the same k block can run in parallel because they write + // to different output blocks. So we parallelize within k slices, this + // gives us parallelism level of m x n. Before we can start any kernels + // related to k-th slice, we need to issue m lhs packing tasks and n rhs + // packing tasks. + // + // However, there is a bottleneck when we are finishing kernels for k-th + // slice (at the very end there is only 1 runnable kernel). To mitigate this + // bottleneck we allow kernels from k-th and k+1-th slices to run in + // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same + // output block, so they must not run in parallel. + // + // This gives us the following dependency graph. + // On each k slice we have m x n kernel tasks, m lhs paking tasks and n rhs + // packing tasks. + // Kernel (m, n, k) can start when: + // - kernel (m, n, k-1) has finished + // - lhs packing (m, k) has finished + // - rhs packing (n, k) has finished + // Lhs/rhs packing can start when: + // - all k-1 packing has finished (artificially imposed to limit amount of + // parallel packing) + // + // On top of that we limit runnable tasks to two consecutive k slices. + // This is done to limit amount of memory we need for packed lhs/rhs + // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_). + // + // state_switch_ tracks when we are ready to switch to the next k slice. + // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n). + // These variable are rolling over 3 consecutive k slices: first two we are + // actively executing + one to track completion of kernels in the second + // slice. + static constexpr Index P = 3; + + // Handle to the allocated temporary storage for Lhs/Rhs blocks. + BlockMemHandle packed_mem_; + std::vector packed_lhs_[P - 1]; + std::vector packed_rhs_[P - 1]; + + // If we choose to parallelize only by the sharding dimension, each thread + // will have it's own "thead local" (not a c++ thread local storage) memory + // for packed_lhs or packed_rhs (shard_by_col = false of true). This memory + // can't be passed to a kernel that might execute on a different thread. + // + // In practice when we are ready to pack memory for the sharding dimension + // (rhs if shard_by_col==true) of the K-th slice, all kernels for K-1 slice + // already computed (99% of the time), and we can pack data into the thread + // local storage, and guarantee that all the kernels will be executed + // immediately in the same thread. This significantly increases L1 cache hit + // ratio and reduces pressure on the memory bus. + // + // It's still possible that kernel for the K-th slice will be ready before + // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_ + // and packed_rhs_ to allow kernels to be executed later on a thread + // different from the thread that was used for packing. + + // Handle for pre-allocated thread local memory buffers. + BlockMemHandle thread_local_pre_alocated_mem_; + + // Only one of these will be initialized depending on shard_by_col value + // (the size will be `num_worker_threads * num_grains_in_the_sharding_dim`). + std::vector lhs_thread_local_pre_allocated_; + std::vector rhs_thread_local_pre_allocated_; + + // How many thread local blocks were already allocated. + std::atomic num_thread_local_allocations_; + const int thread_local_capacity; + + // We will use pre-allocated Lhs/Rhs blocks defined above, if the number of + // unique threads in a system is below or equal to the number of threads in + // a thread pool. We will fallback on dynamic memory allocation after that. + + // ThreadLocalBlocks is a container for Lhs or Rhs thread local buffers. Its + // size is equal to the grain size in Lhs/Rhs sharding dimension. + template + class ThreadLocalBlocks { + public: + ThreadLocalBlocks() = default; + + ThreadLocalBlocks(BlockType* base, size_t grain_size) + : is_pre_allocated_(true), thread_local_pre_allocated_base_(base), grain_size_(grain_size) {} + + ThreadLocalBlocks(BlockMemHandle mem_handle, std::vector blocks) + : is_pre_allocated_(false), mem_handle_(std::move(mem_handle)), blocks_(std::move(blocks)) {} + + BlockType& block(int grain_index) { + eigen_assert(grain_index >= 0); + eigen_assert(static_cast(grain_index) < size()); + return is_pre_allocated_ ? thread_local_pre_allocated_base_[grain_index] : blocks_[grain_index]; + } + + void Release(EvalParallelContext& ctx) const { + if (!is_pre_allocated_) { + ctx.kernel_.deallocate(ctx.device_, mem_handle_); + } + } + + size_t size() const { return is_pre_allocated_ ? grain_size_ : blocks_.size(); } + + private: + bool is_pre_allocated_; + + // Reuse pre-allocated thread local buffers. + BlockType* thread_local_pre_allocated_base_ = nullptr; + size_t grain_size_ = 0; + + // These will be initialized only if `is_pre_allocated == false`. + BlockMemHandle mem_handle_{}; + std::vector blocks_; + }; + + // ThreadLocalBlocksInitialize callable does custom thread local blocks + // initialization, and will reuse pre-allocated buffers if possible, or will + // dynamically allocate new memory. + // + // Lhs/Rhs blocks might be of the same type, so we have to pass explicitly + // for what side do we plan to do block allocation. + template + class ThreadLocalBlocksInitialize { + static constexpr bool kIsLhs = !is_rhs && std::is_same::value; + static const bool kIsRhs = is_rhs && std::is_same::value; + static_assert(kIsLhs || kIsRhs, "Unknown block type"); + + using Blocks = ThreadLocalBlocks; + + public: + ThreadLocalBlocksInitialize(EvalParallelContext& ctx) + : ctx_(ctx), num_worker_threads_(ctx_.device_.numThreadsInPool()) {} + + void operator()(Blocks& blocks) { + const int n = ctx_.num_thread_local_allocations_.fetch_add(1, std::memory_order_relaxed); + + if (n >= num_worker_threads_) { + ThreadLocalBlocksAllocator::allocate(ctx_, blocks); + } else { + ThreadLocalBlocksAllocator::reuse(ctx_, n, blocks); + } + } + + private: + // NOTE(ezhulenev): Without 'if constexpr' we have to put calls to + // TensorContractionKernel::allocateSlices into template specializations. + // Also explicit specializations are not allowed at class scope in C++03, + // EvalCtx type parameter is just a workaround for that limitation. + template + struct ThreadLocalBlocksAllocator; + + template + struct ThreadLocalBlocksAllocator { + static void allocate(EvalCtx& ctx, Blocks& blocks) { + std::vector rhs_blocks; + BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(ctx.device_, + /*num_lhs=*/0, + /*num_rhs=*/ctx.gn_, + /*num_slices=*/1, + /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks); + + blocks = ThreadLocalBlocks(std::move(mem_handle), std::move(rhs_blocks)); + } + + static void reuse(EvalCtx& ctx, int index, Blocks& blocks) { + RhsBlock* ptr = &ctx.rhs_thread_local_pre_allocated_[ctx.gn_ * index]; + blocks = ThreadLocalBlocks(ptr, ctx.gn_); + } + }; + + template + struct ThreadLocalBlocksAllocator { + static void allocate(EvalCtx& ctx, Blocks& blocks) { + std::vector lhs_blocks; + BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(ctx.device_, + /*num_lhs=*/ctx.gm_, + /*num_rhs=*/0, + /*num_slices=*/1, + /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr); + + blocks = ThreadLocalBlocks(std::move(mem_handle), std::move(lhs_blocks)); + } + + static void reuse(EvalCtx& ctx, int index, Blocks& blocks) { + LhsBlock* ptr = &ctx.lhs_thread_local_pre_allocated_[ctx.gm_ * index]; + blocks = ThreadLocalBlocks(ptr, ctx.gm_); + } + }; + + EvalParallelContext& ctx_; + const int num_worker_threads_; + }; + + template + class ThreadLocalBlocksRelease { + public: + using Blocks = ThreadLocalBlocks; + ThreadLocalBlocksRelease(EvalParallelContext& ctx) : ctx_(ctx) {} + void operator()(Blocks& blocks) { blocks.Release(ctx_); } + + private: + EvalParallelContext& ctx_; + }; + + // ThreadLocalBlocks initialization callables. + using ThreadLocalLhsInit = ThreadLocalBlocksInitialize; + using ThreadLocalRhsInit = ThreadLocalBlocksInitialize; + + // ThreadLocalBlocks release callables. + using ThreadLocalLhsRelease = ThreadLocalBlocksRelease; + using ThreadLocalRhsRelease = ThreadLocalBlocksRelease; + + // Thread local containers for Lhs/Rhs block packs. In practice only one of + // them will be used, depending on the shard_by_col value. + Eigen::ThreadLocal, ThreadLocalLhsInit, ThreadLocalLhsRelease> lhs_thread_local_blocks_; + Eigen::ThreadLocal, ThreadLocalRhsInit, ThreadLocalRhsRelease> rhs_thread_local_blocks_; + + // After a particular shard for Kth slice missed thread local execution + // opportunity (K-1 slice didn't complete kernels execution), we can no + // longer schedule K+1 and following slices in thread local mode, because + // there is no more guarantee that previous kernels were executed + // sequentially in the same thread (size is nn_ or nm_). + std::atomic* can_use_thread_local_packed_; + + std::atomic** state_kernel_[P]; + // state_switch_ is frequently modified by worker threads, while other + // fields are read-only after constructor. Let's move it to a separate cache + // line to reduce cache-coherency traffic. + char pad_[128]; + std::atomic state_packing_ready_[P]; + std::atomic state_switch_[P]; + + LhsBlock& packed_lhs(Index m, Index k, Index m1, bool use_thread_local) { + if (use_thread_local) { + eigen_assert(!shard_by_col_); + ThreadLocalBlocks& blocks = lhs_thread_local_blocks_.local(); + + Index grain_index = m1 - m * gm_; + return blocks.block( + internal::convert_index(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index? + } else { + return packed_lhs_[k % (P - 1)][m1]; + } + } + + RhsBlock& packed_rhs(Index n, Index k, Index n1, bool use_thread_local) { + if (use_thread_local) { + eigen_assert(shard_by_col_); + ThreadLocalBlocks& blocks = rhs_thread_local_blocks_.local(); + + Index grain_index = n1 - n * gn_; + return blocks.block( + internal::convert_index(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index? + } else { + return packed_rhs_[k % (P - 1)][n1]; + } + } + + // In following two methods (pack_lhs and pack_rhs), if we know for sure + // that we'll be able to immediately call a kernel with packed data, and do + // not submit it to the thread pool, we can use thread local memory for + // packed data. + // + // We can only reliably check it if we are running all kernels in sync mode + // (parallelize only by sharding dim). If kernel for m==0 (n==0) is ready to + // run, it's guaranteed that all kernels with larger values of m (n) are + // also ready, because we execute them in the same order for all K slices. + + void pack_lhs(Index m, Index k) { + bool use_thread_local = false; + + if (parallelize_by_sharding_dim_only_ && !shard_by_col_ && + can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) { + if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) { + use_thread_local = true; + } else { + // If we can't guarantee that all kernels in `k` slice will be + // executed sequentially in current thread, it's no longer safe to use + // thread local memory in following slices along the k dimensions. + eigen_assert(k > 0); + can_use_thread_local_packed_[m].store(false, std::memory_order_relaxed); + } + } + + const Index mend = m * gm_ + gm(m); + for (Index m1 = m * gm_; m1 < mend; m1++) + kernel_.packLhs(&packed_lhs(m, k, m1, use_thread_local), lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); + + if (!parallel_pack_ && shard_by_col_) { + eigen_assert(!use_thread_local); + signal_packing(k); + } else { + signal_switch(k + 1); + for (Index n = nn_ - 1; n >= 0; n--) { + bool sync = parallelize_by_sharding_dim_only_ || n == 0; + signal_kernel(m, n, k, sync, use_thread_local); + } + } + } + + void pack_rhs(Index n, Index k) { + bool use_thread_local = false; + + if (parallelize_by_sharding_dim_only_ && shard_by_col_ && + can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) { + if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) { + use_thread_local = true; + } else { + // If we can't guarantee that all kernels in `k` slice will be + // executed sequentially in current thread, it's no longer safe to use + // thread local memory in following slices along the k dimensions. + eigen_assert(k > 0); + can_use_thread_local_packed_[n].store(false, std::memory_order_relaxed); + } + } + + const Index nend = n * gn_ + gn(n); + for (Index n1 = n * gn_; n1 < nend; n1++) { + if (!TensorContractionKernel::HasBeta && k == 0) { + // Zero the output memory in parallel, only if contraction kernel does + // not support `beta`. Otherwise we will pass beta 0.0 to the first + // call to the `TensorContractionKernel::invoke()`. + // + // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn + // x m) row. Safe to do here because all kernels that will write to + // this memory depend on completion of this task. Note: don't call + // device_.fill() here. device_.fill() blocks on thread pool + // worker thread, which can lead to underutilization and deadlocks. + std::fill_n(buffer_ + n1 * bn_ * m_, bn(n1) * m_, Scalar(0)); + } + kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local), rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); + } + + if (parallel_pack_ || shard_by_col_) { + signal_switch(k + 1); + for (Index m = nm_ - 1; m >= 0; m--) { + bool sync = parallelize_by_sharding_dim_only_ || m == 0; + signal_kernel(m, n, k, sync, use_thread_local); + } + } else { + eigen_assert(!use_thread_local); + signal_packing(k); + } + } + + void kernel(Index m, Index n, Index k, bool use_thread_local) { + // Note: order of iteration matters here. Iteration over m is innermost + // because we want to reuse the same packed rhs in consecutive tasks + // (rhs fits into L2$ while lhs only into L3$). + const Index nend = n * gn_ + gn(n); + const Index mend = m * gm_ + gm(m); + + // NOTE: output = alpha * LHS * RHS + beta * output. + const Scalar alpha = Scalar(1); + const Scalar beta = (TensorContractionKernel::HasBeta && k == 0) ? Scalar(0) : Scalar(1); + + if (shard_by_col_) { + for (Index n1 = n * gn_; n1 < nend; n1++) { + for (Index m1 = m * gm_; m1 < mend; m1++) { + const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); + kernel_.invoke(output_mapper, packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), + packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), bk(k), bn(n1), alpha, beta); + + // We are done with the last task for the [m1, n1] block. + if (k + 1 == nk_) { + output_kernel_(output_mapper, tensor_contraction_params_, m1 * bm_, n1 * bn_, bm(m1), bn(n1)); + } + } + } + } else { + for (Index m1 = m * gm_; m1 < mend; m1++) + for (Index n1 = n * gn_; n1 < nend; n1++) { + const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); + kernel_.invoke(output_mapper, packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), + packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), bk(k), bn(n1), alpha, beta); + + // We are done with the last task for the [m1, n1] block. + if (k + 1 == nk_) { + output_kernel_(output_mapper, tensor_contraction_params_, m1 * bm_, n1 * bn_, bm(m1), bn(n1)); + } + } + } + signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false); + signal_switch(k + 2); + } + + void signal_packing(Index k) { + eigen_assert(!parallel_pack_); + Index s = state_packing_ready_[k % P].fetch_sub(1); + eigen_assert(s > 0); + if (s != 1) return; + state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_; + enqueue_packing(k, shard_by_col_); + } + + void signal_kernel(Index m, Index n, Index k, bool sync, bool use_thread_local) { + std::atomic* state = &state_kernel_[k % P][m][n]; + Index s = state->load(); + eigen_assert(s > 0); + if (s != 1 && state->fetch_sub(1) != 1) { + eigen_assert(!use_thread_local); + return; + } + state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed); + if (sync) { + kernel(m, n, k, use_thread_local); + } else { + eigen_assert(!use_thread_local); + device_.enqueueNoNotification([this, m, n, k, use_thread_local]() { + kernel(m, n, k, use_thread_local); + }); + } + } + + void signal_switch(Index k, Index v = 1) { + Index s = state_switch_[k % P].fetch_sub(v); + eigen_assert(s >= v); + if (s != v) return; + + // Ready to switch to the next k slice. + // Reset counter for the next iteration. + state_switch_[k % P] = (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) + nm_ * nn_; + if (k < nk_) { + // Issue lhs/rhs packing. Their completion will in turn kick off + // kernels. + if (parallel_pack_) { + enqueue_packing(k, !shard_by_col_); + enqueue_packing(k, shard_by_col_); + } else if (shard_by_col_) { + enqueue_packing(k, false); + } else { + enqueue_packing(k, true); + } + + // Termination handling. + // Because kernel completion signals k + 2 switch, we need to finish nk + // + 2 slices without issuing any tasks on nk + 1 slice. So here we + // pretend that all nk + 1 packing tasks just finish instantly; so that + // nk + 2 switch only waits for completion of nk kernels. + } else if (k == nk_) { + signal_switch(k + 1, parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)); + } else { + done_.Notify(); + } + } + + // Enqueue all rhs/lhs packing for k-th slice. + void enqueue_packing(Index k, bool rhs) { enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs); } + + void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) { + if (end - start == 1) { + if (rhs) + pack_rhs(start, k); + else + pack_lhs(start, k); + } else { + while (end - start > 1) { + Index mid = (start + end) / 2; + device_.enqueueNoNotification([this, mid, end, k, rhs]() { + enqueue_packing_helper(mid, end, k, rhs); + }); + end = mid; + } + + // Decide if we want to run first packing task (start == 0) in + // async mode if we parallelize only by sharding dim: + // (1) pack_lhs and pack_rhs call signal_switch before completing + // all calls to signal_kernel, which in sync mode might lead + // to the execution of the first kernel of the k+1 slice, before + // completing a call to the last kernel of the k slice. + // (2) all pack tasks for sharded dim must be executed in a thread + // pool to get pre-allocated thead local buffers. + bool pack_async = (start == 0) && (parallelize_by_sharding_dim_only_ && shard_by_col_ == rhs) && + (k > 0 || std::this_thread::get_id() == created_by_thread_id_); + + if (pack_async) { + device_.enqueueNoNotification([this, start, end, k, rhs]() { + enqueue_packing_helper(start, end, k, rhs); + }); + } else { + enqueue_packing_helper(start, end, k, rhs); + } + } + } + + // Block sizes with accounting for potentially incomplete last block. + Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; } + Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; } + Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; } + // Task grain sizes accounting for potentially incomplete last task. + Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; } + Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; } + + EvalParallelContext(const EvalParallelContext&) = delete; + void operator=(const EvalParallelContext&) = delete; + }; + + template + using SyncEvalParallelContext = EvalParallelContext; + + // ------------------------------------------------------------------------ // + + // EvalShardedByInnerDimContext orchestrates sync/async contraction + // evaluation, when we shard by inner dimension. When it is executed in + // asynchronous mode, it owns all the shared state that might be accessible by + // block processing tasks. + + template + struct EvalShardedByInnerDimContext { + EvalShardedByInnerDimContext(const Self* self, int num_threads, Scalar* result_buffer, Index m_size, Index n_size, + Index k_size, DoneCallback done_callback) + : evaluator(self), + m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous), + m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous), + m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered), + result(result_buffer), + m(m_size), + n(n_size), + k(k_size), + done(std::move(done_callback)), + buffer_size_bytes(m * n * sizeof(Scalar)), + block_size(blockSize(k, num_threads)), + num_blocks(numext::div_ceil(k, block_size)), + num_pending_blocks(internal::convert_index(num_blocks)), + l0_ranges(numext::div_ceil(num_blocks, l0_size)), + l0_state(l0_ranges), + block_buffers(num_blocks) { + // Keep count of pending gemm tasks for each l0 range. + for (int i = 0; i < l0_ranges; ++i) { + const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i); + l0_state.emplace_back(internal::convert_index(num_pending_tasks)); + } + + // Allocate temporary buffers for each block. + for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { + Scalar* buf = block_idx == 0 ? result : static_cast(evaluator->m_device.allocate(buffer_size_bytes)); + block_buffers.emplace_back(buf); + } + } + + ~EvalShardedByInnerDimContext() { + for (Index i = 1; i < num_blocks; ++i) { + evaluator->m_device.deallocate(block_buffers[i]); + } + } + + template + void run() { + Barrier barrier(internal::convert_index(num_blocks)); + eval(barrier, 0, num_blocks); + barrier.Wait(); + + // Aggregate partial sums from l0 ranges. + aggregateL0Blocks(); + + // Apply output kernel. + applyOutputKernel(); + } + + template + void runAsync() { + evalAsync(0, num_blocks); + } + + private: + // The underlying GEMM kernel assumes that k is a multiple of + // the packet size and subtle breakage occurs if this is violated. + static const Index packet_size = internal::packet_traits::size; + + const Self* evaluator; // TensorContraction evaluator + + // These fields required fromTENSOR_CONTRACTION_DISPATCH macro. + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + Scalar* result; + + Index m; + Index n; + Index k; + + DoneCallback done; + + // ----------------------------------------------------------------------// + // Algorithm parameters. + + // We will compute partial results into the buffers of this size. + Index buffer_size_bytes; + + Index block_size; + Index num_blocks; + + // Keep track of pending tasks when evaluate in async mode. + std::atomic num_pending_blocks; + + // We compute partial gemm results in parallel, and to get the final result + // we need to add them all together. For the large number of threads (>= 48) + // this adds a very expensive sequential step at the end. + // + // We split the [0, num_blocks) into small ranges, and when a task for the + // block finishes its partial gemm computation, it checks if it was the last + // gemm in the range, and if so, it will add all blocks of the range. + // + // After all tasks done, we need to add only these pre-aggregated blocks. + + // For now we use just a single level of ranges to compute pre-aggregated + // partial sums, but in general we can use more layers to compute tree + // aggregation in parallel and reduce the size of the sequential step. + // + // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make + // sense only if number of threads >= ~128? + static const Index l0_size = 4; + Index l0_ranges; + + // Keep count of pending gemm tasks for each l0 range. + MaxSizeVector> l0_state; // [0, l0_ranges) + + // Buffers allocated for each temporary block computation. + MaxSizeVector block_buffers; // [0, num_blocks) + + template + void processBlock(Index block_idx, Index begin, Index end) { + Scalar* buf = block_buffers[block_idx]; + + TENSOR_CONTRACTION_DISPATCH(evaluator->template evalGemmPartialWithoutOutputKernel, Alignment, + (buf, begin, end, + /*num_threads=*/internal::convert_index(num_blocks))); + + // Check if it was the last task in l0 range. + const Index l0_index = block_idx / l0_size; + const int v = l0_state[l0_index].fetch_sub(1); + eigen_assert(v >= 1); + + // If we processed the last block of the range, we can aggregate all + // partial results into the first block of the range. + if (v == 1) { + const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index); + const Index dst_block_idx = l0_index * l0_size; + + if (rng_size == l0_size) { + addAllToBuffer(m * n, + /*src_buf0=*/block_buffers[dst_block_idx + 1], + /*src_buf1=*/block_buffers[dst_block_idx + 2], + /*src_buf2=*/block_buffers[dst_block_idx + 3], + /*dst_buf= */ block_buffers[dst_block_idx]); + } else { + // Aggregate blocks of potentially incomplete last range. + for (int i = 1; i < rng_size; ++i) { + addToBuffer(m * n, + /*src_buf=*/block_buffers[dst_block_idx + i], + /*dst_buf=*/block_buffers[dst_block_idx]); + } + } + } + } + + // Aggregate partial sums from l0 ranges. + template + void aggregateL0Blocks() const { + Index l0_index = 1; + + for (; l0_index + 2 < l0_ranges; l0_index += 3) { + addAllToBuffer(m * n, + /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size], + /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size], + /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size], + /*dst_buf= */ block_buffers[0]); + } + + for (; l0_index < l0_ranges; ++l0_index) { + addToBuffer(m * n, block_buffers[l0_index * l0_size], block_buffers[0]); + } + } + + void applyOutputKernel() const { + typedef internal::blas_data_mapper OutputMapper; + evaluator->m_output_kernel(OutputMapper(result, m), evaluator->m_tensor_contraction_params, + static_cast(0), static_cast(0), m, n); + } + + // Compute block size with accounting for potentially incomplete last block. + Index actualBlockSize(Index block_idx) const { + return block_idx + 1 < num_blocks ? block_size : k + block_size - block_size * num_blocks; + }; + + // Compute range size with accounting for potentially incomplete last range. + Index actualRangeSize(Index num_ranges, Index range_size, Index range_idx) const { + eigen_assert(range_idx < num_ranges); + return range_idx + 1 < num_ranges ? range_size : num_blocks + range_size - range_size * num_ranges; + }; + + template + EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf, Scalar* tgt_buf) { + const int output_packet_size = internal::unpacket_traits::size; + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const PacketReturnType src_val = internal::pload(src_buf + i); + const PacketReturnType tgt_val = internal::ploadt(tgt_buf + i); + const PacketReturnType sum = internal::padd(src_val, tgt_val); + internal::pstoret(tgt_buf + i, sum); + } + for (; i < n; ++i) { + tgt_buf[i] += src_buf[i]; + } + } + + template + EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n, const Scalar* src_buf0, const Scalar* src_buf1, + const Scalar* src_buf2, Scalar* dst_buf) { + using ::Eigen::internal::padd; + using ::Eigen::internal::pload; + using ::Eigen::internal::ploadt; + using ::Eigen::internal::pstoret; + + const int output_packet_size = internal::unpacket_traits::size; + + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const auto src_val0 = pload(src_buf0 + i); + const auto src_val1 = pload(src_buf1 + i); + const auto src_val2 = pload(src_buf2 + i); + + const auto dst_val = ploadt(dst_buf + i); + const auto sum = padd(padd(dst_val, src_val0), padd(src_val1, src_val2)); + + pstoret(dst_buf + i, sum); + } + for (; i < n; ++i) { + dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i]; + } + } + + template + void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) { + while (end_block_idx - start_block_idx > 1) { + Index mid_block_idx = (start_block_idx + end_block_idx) / 2; + evaluator->m_device.enqueueNoNotification([this, &barrier, mid_block_idx, end_block_idx]() { + eval(barrier, mid_block_idx, end_block_idx); + }); + end_block_idx = mid_block_idx; + } + + Index block_idx = start_block_idx; + Index block_start = block_idx * block_size; + Index block_end = block_start + actualBlockSize(block_idx); + + processBlock(block_idx, block_start, block_end); + barrier.Notify(); + } + + template + void evalAsync(Index start_block_idx, Index end_block_idx) { + while (end_block_idx - start_block_idx > 1) { + Index mid_block_idx = (start_block_idx + end_block_idx) / 2; + evaluator->m_device.enqueueNoNotification( + [this, mid_block_idx, end_block_idx]() { + evalAsync(mid_block_idx, end_block_idx); + }); + end_block_idx = mid_block_idx; + } + + Index block_idx = start_block_idx; + + Index block_start = block_idx * block_size; + Index block_end = block_start + actualBlockSize(block_idx); + + processBlock(block_idx, block_start, block_end); + + int v = num_pending_blocks.fetch_sub(1); + eigen_assert(v >= 1); + + if (v == 1) { + // Aggregate partial sums from l0 ranges. + aggregateL0Blocks(); + + // Apply output kernel. + applyOutputKernel(); + + // NOTE: If we call `done` callback before deleting this (context), + // it might deallocate Self* pointer captured by context, and we'll + // fail in destructor trying to deallocate temporary buffers. + + // Move done call back from context before it will be destructed. + DoneCallback done_copy = std::move(done); + + // We are confident that we are the last one who touches context. + delete this; + + // Now safely call the done callback. + done_copy(); + } + } + + // Cost model doesn't capture well the cost associated with constructing + // tensor contraction mappers and computing loop bounds in gemm_pack_lhs + // and gemm_pack_rhs, so we specify minimum desired block size. + static Index blockSize(Index k, int num_threads) { + const auto round_up = [=](Index index) -> Index { + const Index kmultiple = packet_size <= 8 ? 8 : packet_size; + return numext::div_ceil(index, kmultiple) * kmultiple; + }; + + const Index target_block_size = round_up(numext::div_ceil(k, num_threads)); + const Index desired_min_block_size = 12 * packet_size; + + return numext::mini(k, numext::maxi(desired_min_block_size, target_block_size)); + } + + EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete; + void operator=(const EvalShardedByInnerDimContext&) = delete; + }; + + // ------------------------------------------------------------------------ // + + // Below are the function used by evalProductImpl heuristics, trying to select + // optimcal parameters for parallelization algorithm. + + // Decide whether we want to shard m x n contraction by columns or by rows. + static bool shardByCol(Index m, Index n, Index num_threads) { + // Note: we are comparing both n and m against Traits::nr, it is not + // a mistake. We are trying to figure out how both n and m will fit into + // the main sharding dimension. + + // Sharding by column is the default + // ... unless there is enough data for vectorization over rows + if (m / num_threads >= Traits::nr && + // and not enough data for vectorization over columns + (n / num_threads < Traits::nr || + // ... or barely enough data for vectorization over columns, + // but it is not evenly dividable across threads + (n / num_threads < 4 * Traits::nr && (n % (num_threads * Traits::nr)) != 0 && + // ... and it is evenly dividable across threads for rows + ((m % (num_threads * Traits::nr)) == 0 || + // .. or it is not evenly dividable for both dimensions but + // there is much more data over rows so that corner effects are + // mitigated. + (m / n >= 6))))) + return false; + // Wait, or if matrices are just substantially prolonged over the other + // dimension. + if (n / num_threads < 16 * Traits::nr && m > n * 32) return false; + return true; + } + + Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn, int num_threads, bool shard_by_col) const { + Index gm = 1; + Index gm1 = 1; + Index nm0 = numext::div_ceil(m, bm); + Index nm1 = nm0; + for (;;) { + // Find the next candidate for m grain size. It needs to result in + // different number of blocks. E.g. if we have 10 kernels, we want to try + // 5 and 10, but not 6, 7, 8 and 9. + while (gm1 <= nm0 && nm1 == numext::div_ceil(nm0, gm1)) gm1++; + if (gm1 > nm0) break; + // Check the candidate. + int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads, shard_by_col); + if (res < 0) break; + nm1 = numext::div_ceil(nm0, gm1); + if (res == 0) continue; + // Commit new grain size. + gm = gm1; + } + return gm; + } + + Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm, int num_threads, bool shard_by_col) const { + Index gn = 1; + Index gn1 = 1; + Index nn0 = numext::div_ceil(n, bn); + Index nn1 = nn0; + for (;;) { + while (gn1 <= nn0 && nn1 == numext::div_ceil(nn0, gn1)) gn1++; + if (gn1 > nn0) break; + int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads, shard_by_col); + if (res < 0) break; + nn1 = numext::div_ceil(nn0, gn1); + if (res == 0) continue; + gn = gn1; + } + return gn; + } + + // checkGrain checks whether grain (gm, gn) is suitable and is better than + // (oldgm, oldgn). + int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm, Index gn, Index oldgm, Index oldgn, + int num_threads, bool shard_by_col) const { + const TensorOpCost cost = contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true); + double taskSize = TensorCostModel::taskSize(static_cast(bm) * gm * bn * gn, cost); + // If the task is too small, then we agree on it regardless of anything + // else. Otherwise synchronization overheads will dominate. + if (taskSize < 1) return 1; + // If it is too large, then we reject it and all larger tasks. + if (taskSize > 2) return -1; + // Now we are in presumably good task size range. + // The main deciding factor here is parallelism. Consider that we have 12 + // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes. + // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4 + // of cores will be busy). While grain size 3 gives us 4 tasks, which gives + // us parallelism of 1 (we can load all cores). + Index nm0 = numext::div_ceil(m, bm); + Index nn0 = numext::div_ceil(n, bn); + Index new_tasks = numext::div_ceil(nm0, gm) * numext::div_ceil(nn0, gn); + double new_parallelism = + static_cast(new_tasks) / (numext::div_ceil(new_tasks, num_threads) * num_threads); + Index old_tasks = numext::div_ceil(nm0, oldgm) * numext::div_ceil(nn0, oldgn); + double old_parallelism = + static_cast(old_tasks) / (numext::div_ceil(old_tasks, num_threads) * num_threads); + if (new_parallelism > old_parallelism || new_parallelism == 1) return 1; + return 0; + } + + TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, bool shard_by_col, + bool prepacked) const { + const int packed_size = std::min(PacketType::size, PacketType::size); + const int output_packet_size = internal::unpacket_traits::size; + const double kd = static_cast(bk); + double compute_bandwidth = computeBandwidth(false, bm, bn, bk); + // Computations. + TensorOpCost cost = TensorOpCost(0, 0, kd * compute_bandwidth, true, packed_size); + // Output stores. + cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); + if (prepacked) { + // Packing and kernels are executed in different tasks. When we calculate + // task grain size we look only at kernel cost assuming that kernel + // is more expensive than packing. + return cost; + } + // Lhs/rhs loads + computations. + TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n); + TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m); + // Lhs packing memory cost does not contribute considerably to overall + // execution time because lhs is prefetched early and accessed sequentially. + if (shard_by_col) + lhsCost.dropMemoryCost(); + else + rhsCost.dropMemoryCost(); + return cost + lhsCost + rhsCost; + } + + // Decide whether we want to shard m x k x n contraction over the inner + // (contraction) dimension (k). + static bool shardByInnerDim(Index m, Index n, Index k, int num_threads, int num_threads_by_k) { + std::ptrdiff_t bufsize = m * n * sizeof(Scalar); + bool shard_by_k = false; + if (n == 1 || // If mat*vec or... + num_threads_by_k < 2 || // running single threaded or... + num_threads_by_k < num_threads || // sharding by k gives less parallelism or... + bufsize > l3CacheSize() / num_threads_by_k || // need more buffer space + // than L3 cache or... + k / num_threads_by_k < 2 * Traits::nr) { // k per thread is tiny. + shard_by_k = false; + } else if (numext::maxi(m, n) / num_threads < Traits::nr || // both other dimensions are tiny or... + // k per thread is not small and... + (k / num_threads_by_k > 8 * Traits::nr && + // one of the outer dimensions is tiny or sharding by k offers + // more parallelism. + (numext::mini(m, n) < 2 * Traits::nr || num_threads_by_k > num_threads))) { + shard_by_k = true; + } + return shard_by_k; + } + + TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const { + // Compute cost. + const int output_packet_size = internal::unpacket_traits::size; + TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size); + // Output stores. + cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); + TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m; + TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * n; + // Since the inner gemm kernel is always sharded by column, the lhs + // load cost is negligible. + lhsCost.dropMemoryCost(); + return cost + lhsCost + rhsCost; + } + + int numThreadsInnerDim(Index m, Index n, Index k) const { + const int output_packet_size = internal::unpacket_traits::size; + TensorOpCost cost = contractionCostPerInnerDim(m, n, k); + double total_parallel_cost = TensorCostModel::totalCost(k, cost); + // Cost of reduction step accumulating the m*n per-thread buffers into the + // result. + double reduction_cost = + TensorCostModel::totalCost(m * n, TensorOpCost(2, 1, 1, true, output_packet_size)); + int num_threads = 1; + double min_cost = total_parallel_cost; + double kPerThreadOverHead = 3000; + double kFixedOverHead = 100000; + for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) { + double sequential_cost = kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead); + double parallel_cost = total_parallel_cost / nt + sequential_cost; + if (parallel_cost < min_cost) { + num_threads = nt; + min_cost = parallel_cost; + } + } + return num_threads; + } + + double computeBandwidth(bool shard_by_col, Index bm, Index bn, Index bk) const { + // Peak VFMA bandwidth is 0.5. However if we have not enough data for + // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined + // experimentally. + double computeBandwidth = bk == 1 ? 4.0 + : (shard_by_col ? bn : bm) < Traits::nr || (shard_by_col ? bm : bn) < Traits::mr ? 2.0 + : 0.5; +#ifndef EIGEN_VECTORIZE_FMA + // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. + // However for MULPS/ADDPS we have dependent sequence of 2 such + // instructions, + // so overall bandwidth is 1.0. + if (computeBandwidth == 0.5) computeBandwidth = 1.0; +#endif + return computeBandwidth; + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_THREADS +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h new file mode 100644 index 00000000000..5e4a586d8b1 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -0,0 +1,416 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorConversionOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor conversion class. This class makes it possible to vectorize + * type casting operations when the number of scalars per packet in the source + * and the destination type differ + */ +namespace internal { +template +struct traits > { + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef TargetType Scalar; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = traits::Layout; + enum { Flags = 0 }; + typedef typename TypeConversion::PointerType>::type PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorConversionOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorConversionOp type; +}; + +} // end namespace internal + +template +struct PacketConverter; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + return internal::pcast(m_impl.template packet(index)); + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); + SrcPacket src3 = m_impl.template packet(index + 2 * SrcPacketSize); + SrcPacket src4 = m_impl.template packet(index + 3 * SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2, src3, src4); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + 1 * SrcPacketSize); + SrcPacket src3 = m_impl.template packet(index + 2 * SrcPacketSize); + SrcPacket src4 = m_impl.template packet(index + 3 * SrcPacketSize); + SrcPacket src5 = m_impl.template packet(index + 4 * SrcPacketSize); + SrcPacket src6 = m_impl.template packet(index + 5 * SrcPacketSize); + SrcPacket src7 = m_impl.template packet(index + 6 * SrcPacketSize); + SrcPacket src8 = m_impl.template packet(index + 7 * SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2, src3, src4, src5, src6, src7, src8); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) + : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + // Only call m_impl.packet() when we have direct access to the underlying data. This + // ensures that we don't compute the subexpression twice. We may however load some + // coefficients twice, but in practice this doesn't negatively impact performance. + if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) { + // Force unaligned memory loads since we can't ensure alignment anymore + return internal::pcast(m_impl.template packet(index)); + } else { + const int TgtPacketSize = internal::unpacket_traits::size; + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TgtType; + internal::scalar_cast_op converter; + EIGEN_ALIGN_MAX typename internal::unpacket_traits::type values[TgtPacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < TgtPacketSize; ++i) { + values[i] = converter(m_impl.coeff(index + i)); + } + TgtPacket rslt = internal::pload(values); + return rslt; + } + } + + private: + const TensorEvaluator& m_impl; + const typename TensorEvaluator::Index m_maxIndex; +}; + +template +class TensorConversionOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::nested::type Nested; + typedef Scalar CoeffReturnType; + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr) : m_xpr(xpr) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + +template +struct ConversionSubExprEval { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) { + impl.evalSubExprsIfNeeded(NULL); + return true; + } +}; + +template +struct ConversionSubExprEval { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { return impl.evalSubExprsIfNeeded(data); } +}; + +#ifdef EIGEN_USE_THREADS +template +struct ConversionSubExprEvalAsync { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) { + impl.evalSubExprsIfNeededAsync(nullptr, std::move(done)); + } +}; + +template +struct ConversionSubExprEvalAsync { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) { + impl.evalSubExprsIfNeededAsync(data, std::move(done)); + } +}; +#endif + +namespace internal { + +template +struct CoeffConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator& impl, + Index index) { + internal::scalar_cast_op converter; + return converter(impl.coeff(index)); + } +}; + +template +struct CoeffConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator& impl, + Index index) { + return impl.coeff(index); + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TargetType; + + static constexpr int PacketSize = internal::unpacket_traits::size; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, + Index index) { + internal::scalar_cast_op converter; + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = converter(impl.coeff(index + i)); + } + TargetPacket rslt = internal::pload(values); + return rslt; + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TargetType; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, + Index index) { + const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; + const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; + PacketConverter, SrcPacket, TargetPacket, SrcCoeffRatio, TgtCoeffRatio> converter( + impl); + return converter.template packet(index); + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type TargetType; + static constexpr int PacketSize = internal::unpacket_traits::size; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, + Index index) { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index + i); + return internal::pload(values); + } +}; + +template +struct PacketConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, + Index index) { + return impl.template packet(index); + } +}; + +} // namespace internal + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorConversionOp XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef TargetType Scalar; + typedef TargetType CoeffReturnType; + typedef internal::remove_all_t::Scalar> SrcType; + typedef typename PacketType::type PacketReturnType; + typedef typename PacketType::type PacketSourceType; + static constexpr int PacketSize = PacketType::size; + static constexpr bool IsSameType = internal::is_same::value; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = false, + PacketAccess = +#ifndef EIGEN_USE_SYCL + true, +#else + TensorEvaluator::PacketAccess & + internal::type_casting_traits::VectorizedCast, +#endif + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + RawAccess = false + }; + + static constexpr int Layout = TensorEvaluator::Layout; + static constexpr int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + struct TensorConversionOpBlockFactory { + template + struct XprType { + typedef TensorConversionOp type; + }; + + template + typename XprType::type expr(const ArgXprType& expr) const { + return typename XprType::type(expr); + } + }; + + typedef internal::TensorUnaryExprBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + return ConversionSubExprEval, EvaluatorPointerType>::run(m_impl, data); + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) { + ConversionSubExprEvalAsync, EvaluatorPointerType, + EvalSubExprsCallback>::run(m_impl, data, std::move(done)); + } +#endif + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return internal::CoeffConv::run(m_impl, index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + // If we are not going to do the cast, we just need to check that base + // TensorEvaluator has packet access. Otherwise we also need to make sure, + // that we have an implementation of vectorized cast. + const bool Vectorizable = IsSameType ? TensorEvaluator::PacketAccess + : int(TensorEvaluator::PacketAccess) & + int(internal::type_casting_traits::VectorizedCast); + + return internal::PacketConv::run(m_impl, + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double cast_cost = TensorOpCost::CastCost(); + if (vectorized) { + const double SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; + const double TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; + return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) + + TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize)); + } else { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return m_impl.getResourceRequirements(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + return TensorBlock(m_impl.block(desc, scratch), TensorConversionOpBlockFactory()); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + /// required by sycl in order to extract the sycl accessor + const TensorEvaluator& impl() const { return m_impl; } + + protected: + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h new file mode 100644 index 00000000000..26984b69090 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -0,0 +1,1123 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. + * + * + */ +namespace internal { + +template +class IndexMapper { + public: + IndexMapper(const InputDims& input_dims, const array& kernel_dims, + const array& indices) { + array dimensions = input_dims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = indices[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + dimensions[index] = result_dim; + } + + array inputStrides; + array outputStrides; + if (static_cast(Layout) == static_cast(ColMajor)) { + inputStrides[0] = 1; + outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1]; + outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1]; + } + } else { + inputStrides[NumDims - 1] = 1; + outputStrides[NumDims - 1] = 1; + for (int i = static_cast(NumDims) - 2; i >= 0; --i) { + inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; + } + } + + array gpuInputDimensions; + array gpuOutputDimensions; + array tmp = dimensions; + array ordering; + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = i + offset; + ordering[index] = indices[i]; + tmp[indices[i]] = -1; + gpuInputDimensions[index] = input_dims[indices[i]]; + gpuOutputDimensions[index] = dimensions[indices[i]]; + } + + int written = static_cast(Layout) == static_cast(ColMajor) ? NumKernelDims : 0; + for (int i = 0; i < NumDims; ++i) { + if (tmp[i] >= 0) { + ordering[written] = i; + gpuInputDimensions[written] = input_dims[i]; + gpuOutputDimensions[written] = dimensions[i]; + ++written; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[ordering[i]]; + m_outputStrides[i] = outputStrides[ordering[i]]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (i > NumKernelDims) { + m_gpuInputStrides[i] = m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1]; + m_gpuOutputStrides[i] = m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1]; + } else { + m_gpuInputStrides[i] = 1; + m_gpuOutputStrides[i] = 1; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (static_cast(i + 1) < offset) { + m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1]; + m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1]; + } else { + m_gpuInputStrides[i] = 1; + m_gpuOutputStrides[i] = 1; + } + } + } + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_gpuInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_gpuInputStrides[d]; + } + if (NumKernelDims < NumDims) { + inputIndex += p * m_inputStrides[NumKernelDims]; + } + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_gpuInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_gpuInputStrides[d]; + } + inputIndex += p * m_inputStrides[limit]; + } + return inputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const { + Index outputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_gpuOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_gpuOutputStrides[d]; + } + if (NumKernelDims < NumDims) { + outputIndex += p * m_outputStrides[NumKernelDims]; + } + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_gpuOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_gpuOutputStrides[d]; + } + outputIndex += p * m_outputStrides[limit]; + } + return outputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_inputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_outputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + k * m_inputStrides[offset + 2]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + k * m_outputStrides[offset + 2]; + } + + private: + static constexpr int NumDims = internal::array_size::value; + array m_inputStrides; + array m_outputStrides; + array m_gpuInputStrides; + array m_gpuOutputStrides; +}; + +template +struct traits > { + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename promote_storage_type::ret Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, typename traits::Index>::type + Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = traits::Layout; + typedef std::conditional_t::val, + typename traits::PointerType, typename traits::PointerType> + PointerType; + + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorConvolutionOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorConvolutionOp type; +}; + +} // end namespace internal + +template +class TensorConvolutionOp + : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, + const Indices& dims) + : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t& inputExpression() + const { + return m_input_xpr; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t& kernelExpression() + const { + return m_kernel_xpr; + } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + +template +struct TensorEvaluator, Device> { + typedef TensorConvolutionOp XprType; + + static constexpr int NumDims = + internal::array_size::Dimensions>::value; + static constexpr int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = + int(TensorEvaluator::IsAligned) & int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), + m_kernelImpl(op.kernelExpression(), device), + m_kernelArg(op.kernelExpression()), + m_kernel(NULL), + m_local_kernel(false), + m_device(device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; + } + } + + m_dimensions = m_inputImpl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = NumKernelDims - 1; i >= 0; --i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i < NumKernelDims - 1) { + m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; + } else { + m_kernelStride[NumKernelDims - 1] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_inputImpl.evalSubExprsIfNeeded(NULL); + preloadKernel(); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + void evalTo(typename XprType::Scalar* buffer) { + evalSubExprsIfNeeded(NULL); + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + CoeffReturnType result = CoeffReturnType(0); + convolve(firstInput(index), 0, NumKernelDims - 1, result); + return result; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const { + Index indices[2] = {index, index + PacketSize - 1}; + Index startInputs[2] = {0, 0}; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } + startInputs[0] += indices[0]; + startInputs[1] += indices[1]; + + if (startInputs[1] - startInputs[0] == PacketSize - 1) { + PacketReturnType result = internal::pset1(0); + convolvePacket(startInputs[0], 0, NumKernelDims - 1, result); + return result; + } else { + EIGEN_ALIGN_MAX Scalar data[PacketSize]; + data[0] = Scalar(0); + convolve(startInputs[0], 0, NumKernelDims - 1, data[0]); + for (int i = 1; i < PacketSize - 1; ++i) { + data[i] = Scalar(0); + convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]); + } + data[PacketSize - 1] = Scalar(0); + convolve(startInputs[1], 0, NumKernelDims - 1, data[PacketSize - 1]); + return internal::pload(data); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize)); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } + startInput += index; + return startInput; + } + + EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolve(input, kernel, DimIndex - 1, accum); + } else { + accum += m_inputImpl.coeff(input) * m_kernel[kernel]; + } + } + } + + template + EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolvePacket(input, kernel, DimIndex - 1, accum); + } else { + accum = internal::pmadd(m_inputImpl.template packet(input), + internal::pset1(m_kernel[kernel]), accum); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool Vectorize = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + array m_inputStride; + array m_outputStride; + + array m_indexStride; + array m_kernelStride; + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device EIGEN_DEVICE_REF m_device; +}; + +// Use an optimized implementation of the evaluation code for GPUs whenever possible. +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) + +template +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int /*kernelSize*/) const { return StaticKernelSize; } +}; +template <> +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int kernelSize) const { return kernelSize; } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D( + InputEvaluator eval, const internal::IndexMapper indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, + float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else + extern __shared__ float s[]; +#endif + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSize); + const int num_x_output = last_x - first_x + 1; + + const int first_plane = blockIdx.y * blockDim.y; + const int plane_stride = blockDim.y * gridDim.y; + + for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { + // Load inputs to shared memory + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.y * num_x_input; +#pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); + } + + __syncthreads(); + + // Compute the convolution + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); + +#pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + const int kernel_offset = plane_kernel_offset + i; + float result = 0.0f; +#pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { + result += s[k + kernel_offset] * kernel[k]; + } + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x); + buffer[tensor_index] = result; + } + __syncthreads(); + } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D( + InputEvaluator eval, const internal::IndexMapper indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, + const int kernelSizeX, const int kernelSizeY, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else + extern __shared__ float s[]; +#endif + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSizeX); + const int num_x_output = last_x - first_x + 1; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + GetKernelSize()(kernelSizeY); + const int num_y_output = last_y - first_y + 1; + + const int first_plane = blockIdx.z * blockDim.z; + const int plane_stride = blockDim.z * gridDim.z; + + for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.z * num_y_input; + +// Load inputs to shared memory +#pragma unroll + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + const int input_offset = num_x_input * (j + plane_kernel_offset); +#pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = + plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y); + s[i + input_offset] = eval.coeff(tensor_index); + } + } + + __syncthreads(); + + // Convolution + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); + +#pragma unroll + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { +#pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; +#pragma unroll + for (int l = 0; l < GetKernelSize()(kernelSizeY); ++l) { + const int kernel_offset = kernelSizeX * l; + const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); +#pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSizeX); ++k) { + result += s[k + input_offset] * kernel[k + kernel_offset]; + } + } + const int tensor_index = + plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y); + buffer[tensor_index] = result; + } + } + + __syncthreads(); + } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D( + InputEvaluator eval, const internal::IndexMapper indexMapper, + const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY, + const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, + const size_t kernelSizeZ, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else + extern __shared__ float s[]; +#endif + + // Load inputs to shared memory + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + kernelSizeX; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + kernelSizeY; + + const int first_z = blockIdx.z * maxZ; + const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1; + const int num_z_input = last_z - first_z + kernelSizeZ; + + for (int p = 0; p < numPlanes; ++p) { + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = 0; + + for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( + i + first_x, j + first_y, k + first_z); + s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); + } + } + } + + __syncthreads(); + + // Convolution + const int num_z_output = last_z - first_z + 1; + const int num_y_output = last_y - first_y + 1; + const int num_x_output = last_x - first_x + 1; + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); + + for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + for (int n = 0; n < kernelSizeZ; ++n) { + for (int m = 0; m < kernelSizeY; ++m) { + for (int l = 0; l < kernelSizeX; ++l) { + result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * + kernel[l + kernelSizeX * (m + kernelSizeY * n)]; + } + } + } + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset( + i + first_x, j + first_y, k + first_z); + buffer[tensor_index] = result; + } + } + } + __syncthreads(); + } +}; + +template +struct TensorEvaluator, GpuDevice> { + typedef TensorConvolutionOp XprType; + + static constexpr int NumDims = + internal::array_size::Dimensions>::value; + static constexpr int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = + TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), + m_kernelImpl(op.kernelExpression(), device), + m_kernelArg(op.kernelExpression()), + m_indices(op.indices()), + m_buf(NULL), + m_kernel(NULL), + m_local_kernel(false), + m_device(device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static constexpr int PacketSize = internal::unpacket_traits::size; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + static unsigned int ceil(unsigned int num, unsigned int denom) { + const unsigned int rounded_toward_zero = num / denom; + if (num > rounded_toward_zero * denom) { + return rounded_toward_zero + 1; + } + return rounded_toward_zero; + } + + void executeEval(Scalar* data) const { + typedef typename TensorEvaluator::Dimensions InputDims; + + const int maxSharedMem = m_device.sharedMemPerBlock(); + const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock(); + const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = m_device.getNumGpuMultiProcessors(); + const int warpSize = 32; + + switch (NumKernelDims) { + case 1: { + const int kernel_size = m_kernelImpl.dimensions().TotalSize(); + + const int numX = dimensions()[m_indices[0]]; + const int numP = dimensions().TotalSize() / numX; + int maxX; + dim3 block_size; + + const int single_stride_dim = + static_cast(Layout) == static_cast(ColMajor) ? 0 : m_inputImpl.dimensions().rank() - 1; + if (m_indices[0] == single_stride_dim) { + // Maximum the reuse + const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; + maxX = numext::mini(inner_dim, numX); + const int maxP = numext::mini(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); + block_size.x = numext::mini(maxThreadsPerBlock, maxX); + block_size.y = numext::mini(maxThreadsPerBlock / block_size.x, maxP); + } else { + // Read as much as possible alongside the inner most dimension, that is the plane + const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); + const int maxP = numext::mini(inner_dim, numP); + maxX = numext::mini(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); + + block_size.x = numext::mini(warpSize, maxX); + block_size.y = numext::mini(maxThreadsPerBlock / block_size.x, maxP); + } + + const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); + gpu_assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); + + dim3 num_blocks(num_x_blocks, numext::mini(num_y_blocks, ceil(numP, block_size.y))); + + // cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " + // num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " + // << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices{m_indices[0]}; + const array kernel_dims{m_kernelImpl.dimensions()[0]}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size) { + case 4: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 4>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, + numX, maxX, 4, data); + break; + } + case 7: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, + numX, maxX, 7, data); + break; + } + default: { + LAUNCH_GPU_KERNEL( + (EigenConvolutionKernel1D, Index, InputDims, Dynamic>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, + kernel_size, data); + } + } + break; + } + + case 2: { + const int idxX = static_cast(Layout) == static_cast(ColMajor) ? 0 : 1; + const int idxY = static_cast(Layout) == static_cast(ColMajor) ? 1 : 0; + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numP = dimensions().TotalSize() / (numX * numY); + + const float scaling_factor = + sqrtf(static_cast(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); + + // Snap maxX to warp size + int inner_dim = ((static_cast(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; + const int maxX = numext::mini(inner_dim, numX); + const int maxY = + numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); + const int maxP = numext::mini( + maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); + + dim3 block_size; + block_size.x = numext::mini(1024, maxX); + block_size.y = numext::mini(1024 / block_size.x, maxY); + block_size.z = numext::mini(1024 / (block_size.x * block_size.y), maxP); + + const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); + gpu_assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int num_y_blocks = ceil(numY, maxY); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); + + dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini(num_z_blocks, ceil(numP, block_size.z))); + + // cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " + // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << + // " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " + // shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices{m_indices[idxX], m_indices[idxY]}; + const array kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size_x) { + case 4: { + switch (kernel_size_y) { + case 7: { + LAUNCH_GPU_KERNEL( + (EigenConvolutionKernel2D, Index, InputDims, 4, 7>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, + numY, maxY, 4, 7, data); + break; + } + default: { + LAUNCH_GPU_KERNEL( + (EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, + numY, maxY, 4, kernel_size_y, data); + break; + } + } + break; + } + case 7: { + switch (kernel_size_y) { + case 4: { + LAUNCH_GPU_KERNEL( + (EigenConvolutionKernel2D, Index, InputDims, 7, 4>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, + numY, maxY, 7, 4, data); + break; + } + default: { + LAUNCH_GPU_KERNEL( + (EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, + numY, maxY, 7, kernel_size_y, data); + break; + } + } + break; + } + default: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, + Dynamic, Dynamic>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, + numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + break; + } + } + break; + } + + case 3: { + const int idxX = static_cast(Layout) == static_cast(ColMajor) ? 0 : 2; + const int idxY = static_cast(Layout) == static_cast(ColMajor) ? 1 : 1; + const int idxZ = static_cast(Layout) == static_cast(ColMajor) ? 2 : 0; + + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numZ = dimensions()[m_indices[idxZ]]; + const int numP = dimensions().TotalSize() / (numX * numY * numZ); + + const int maxX = numext::mini( + 128, numext::mini(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, + numX)); + const int maxY = numext::mini( + 128, numext::mini( + maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, + numY)); + const int maxZ = numext::mini( + 128, numext::mini( + maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - + kernel_size_z + 1, + numZ)); + + dim3 block_size; + block_size.x = numext::mini(32, maxX); + block_size.y = numext::mini(32, maxY); + block_size.z = numext::mini(1024 / (block_size.x * block_size.y), maxZ); + dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); + + const int shared_mem = + (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); + gpu_assert(shared_mem <= maxSharedMem); + + // cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " + // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << + // " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << + // endl; + const array indices{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}; + const array kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY], + m_kernelImpl.dimensions()[idxZ]}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), + num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, + maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize)); + } + + private: + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + KernelArgType m_kernelArg; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + + const GpuDevice& m_device; +}; +#endif + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h new file mode 100644 index 00000000000..88ae9f4bb5b --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -0,0 +1,546 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Copyright (C) 2016 Benoit Steiner + +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. + * + * + */ + +enum class convolution_type { CONV1D, CONV2D, CONV3D }; +template +struct EigenConvolutionKernel; +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const size_t kernelSize; + const cl::sycl::range<2> input_range; + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const size_t kernelSize_, const cl::sycl::range<2> input_range_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernelSize(kernelSize_), + input_range(input_range_) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const { + return (boolean_check[0] && boolean_check[1]); + } + void operator()(cl::sycl::nd_item<2> itemID) const { + auto buffer_ptr = buffer_acc; + auto kernel_ptr = kernel_filter; + // the required row to be calculated for the for each plane in shered memory + const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1); + const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input; + const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0]; + const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1)); + /// fill the shared memory + for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) { + const size_t local_index = i + plane_kernel_offset; + const size_t tensor_index = + plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset); + + local_acc[local_index] = + (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1]) + ? device_evaluator.coeff(tensor_index) + : CoeffReturnType(0); + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution // output start x + const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]); + if (boundary_check(itemID.get_global_id() < input_range)) { + CoeffReturnType result = static_cast(0); + const size_t index = plane_kernel_offset + itemID.get_local_id(0); + for (size_t k = 0; k < kernelSize; ++k) { + result += (local_acc[k + index] * kernel_ptr[k]); + } + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start); + buffer_ptr[tensor_index] = result; + } + } +}; + +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const cl::sycl::range<2> kernel_size; + const cl::sycl::range<3> input_range; + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernel_size(kernel_size_), + input_range(input_range_) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const { + return (boolean_check[0] && boolean_check[1] && boolean_check[2]); + } + + void operator()(cl::sycl::nd_item<3> itemID) const { + auto buffer_ptr = buffer_acc; + auto kernel_ptr = kernel_filter; + // the required row to be calculated for the for each plane in shered memory + const auto num_input = cl::sycl::range<2>{ + (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)}; + + const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2)); + const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1]; + + const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], + itemID.get_group(1) * itemID.get_local_range()[1]}; + + // fill the local memory + bool in_range_dim2 = itemID.get_global_id(2) < input_range[2]; + for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) { + const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset); + bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1)); + for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) { + const size_t local_index = i + local_input_offset; + const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( + i + input_offset[0], j + input_offset[1]); + local_acc[local_index] = + (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) && in_range_dim1 && in_range_dim2) + ? device_evaluator.coeff(tensor_index) + : CoeffReturnType(0); + } + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // output offset start for each thread + const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], + itemID.get_group(1) * itemID.get_local_range()[1]}; + + if (boundary_check(itemID.get_global_id() < input_range)) { + CoeffReturnType result = static_cast(0); + + for (size_t j = 0; j < kernel_size[1]; j++) { + size_t kernel_offset = kernel_size[0] * j; + const size_t index = + (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0); + for (size_t i = 0; i < kernel_size[0]; i++) { + result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]); + } + } + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0], + itemID.get_local_id(1) + output_offset[1]); + + buffer_ptr[tensor_index] = result; + } + } +}; + +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const cl::sycl::range<3> kernel_size; + const cl::sycl::range<3> input_range; + const size_t numP; + + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_, + const size_t numP_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernel_size(kernel_size_), + input_range(input_range_), + numP(numP_) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const { + return (boolean_check[0] && boolean_check[1] && boolean_check[2]); + } + void operator()(cl::sycl::nd_item<3> itemID) const { + auto buffer_ptr = buffer_acc; + auto kernel_ptr = kernel_filter; + const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1}; + + const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()}; + + const auto output_offset = + cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()}; + + for (size_t p = 0; p < numP; p++) { + /// fill the shared memory + const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) { + size_t local_index_dim2 = num_input[0] * num_input[1] * k; + bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1)); + for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) { + bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1)); + size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2; + for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) { + bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1)); + const size_t local_index = local_index_dim1 + i; + const size_t tensor_index = + plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( + i + input_offset[0], j + input_offset[1], k + input_offset[2]); + local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0); + } + } + } + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution + + if (boundary_check(itemID.get_global_id() < input_range)) { + CoeffReturnType result = static_cast(0); + for (size_t k = 0; k < kernel_size[2]; k++) { + for (size_t j = 0; j < kernel_size[1]; j++) { + for (size_t i = 0; i < kernel_size[0]; i++) { + const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k); + const size_t local_index = + ((i + itemID.get_local_id(0)) + + num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2)))); + + result += (local_acc[local_index] * kernel_ptr[kernel_index]); + } + } + } + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]); + buffer_ptr[tensor_index] = result; + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + } +}; + +template +struct TensorEvaluator, Eigen::SyclDevice> { + typedef TensorConvolutionOp XprType; + + static constexpr int NumDims = + internal::array_size::Dimensions>::value; + static constexpr int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; + typedef const Eigen::SyclDevice Device; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory KernelStorage; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device) + : m_inputImpl(op.inputExpression(), device), + m_kernelArg(op.kernelExpression()), + m_kernelImpl(op.kernelExpression(), device), + m_indices(op.indices()), + m_buf(NULL), + m_kernel(NULL), + m_local_kernel(false), + m_device(device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions &input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions &kernel_dims = + m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (EvaluatorPointerType)m_device.get( + (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate_temp(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate_temp(m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; } + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + typename KernelStorage::Type in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz)); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(m_device.get(local), m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + m_kernel = local; + m_local_kernel = true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const { + typedef TensorEvaluator InputEvaluator; + typedef typename InputEvaluator::Dimensions InputDims; + switch (NumKernelDims) { + case 1: { + const size_t numX = dimensions()[m_indices[0]]; + const size_t numP = dimensions().TotalSize() / numX; + const auto input_dim = std::array{numX, numP}; + auto global_range = cl::sycl::range<2>{1, 1}; + auto local_range = cl::sycl::range<2>{1, 1}; + const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); + + m_device.parallel_for_setup(input_dim, global_range, local_range); + const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]); + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + const array indices{{m_indices[0]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[0]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + typedef EigenConvolutionKernel + ConvKernel; + + m_device + .template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])) + .wait(); + break; + } + + case 2: { + auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 1, + static_cast(Layout) == static_cast(ColMajor) ? 1 : 0}; + auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], + (size_t)m_kernelImpl.dimensions()[kernel_index[1]]}; + const size_t numX = dimensions()[m_indices[kernel_index[0]]]; + const size_t numY = dimensions()[m_indices[kernel_index[1]]]; + const size_t numP = dimensions().TotalSize() / (numX * numY); + auto input_dim = std::array{numX, numY, numP}; + + auto global_range = cl::sycl::range<3>{1, 1, 1}; + auto local_range = cl::sycl::range<3>{1, 1, 1}; + + m_device.parallel_for_setup(input_dim, global_range, local_range); + + const size_t local_memory_size = + (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2]; + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + const array indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}}; + const array kernel_dims{ + {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + typedef EigenConvolutionKernel + ConvKernel; + m_device + .template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}) + .wait(); + break; + } + + case 3: { + auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 2, + static_cast(Layout) == static_cast(ColMajor) ? 1 : 1, + static_cast(Layout) == static_cast(ColMajor) ? 2 : 0}; + + auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], + (size_t)m_kernelImpl.dimensions()[kernel_index[1]], + (size_t)m_kernelImpl.dimensions()[kernel_index[2]]}; + + const size_t numX = dimensions()[m_indices[kernel_index[0]]]; + const size_t numY = dimensions()[m_indices[kernel_index[1]]]; + const size_t numZ = dimensions()[m_indices[kernel_index[2]]]; + auto input_dim = std::array{numX, numY, numZ}; + const size_t numP = dimensions().TotalSize() / (numX * numY * numZ); + + const array indices{ + {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]], + m_kernelImpl.dimensions()[kernel_index[1]], + m_kernelImpl.dimensions()[kernel_index[2]]}}; + + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + auto global_range = cl::sycl::range<3>{1, 1, 1}; + auto local_range = cl::sycl::range<3>{1, 1, 1}; + + m_device.parallel_for_setup(input_dim, global_range, local_range); + auto local_memory_range = (local_range + kernel_size - 1); + const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2]; + + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + typedef EigenConvolutionKernel + ConvKernel; + m_device + .template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP) + .wait(); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_buf != NULL); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const { + eigen_assert(m_buf != NULL); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize)); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator &operator=(const TensorEvaluator &); + TensorEvaluator m_inputImpl; + KernelArgType m_kernelArg; + TensorEvaluator m_kernelImpl; + Indices m_indices; + Dimensions m_dimensions; + EvaluatorPointerType m_buf; + typename KernelStorage::Type m_kernel; + bool m_local_kernel; + const Eigen::SyclDevice EIGEN_DEVICE_REF m_device; +}; // namespace Eigen + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h new file mode 100644 index 00000000000..c0f13337a60 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -0,0 +1,190 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H +#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief A cost model used to limit the number of threads used for evaluating + * tensor expression. + * + */ + +// Class storing the cost of evaluating a tensor expression in terms of the +// estimated number of operand bytes loads, bytes stored, and compute cycles. +class TensorOpCost { + public: + // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple + // model based on minimal reciprocal throughput numbers from Intel or + // Agner Fog's tables would be better than what is there now. + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() { + return internal::functor_traits >::Cost; + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() { + return internal::functor_traits >::Cost; + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() { + return internal::functor_traits >::Cost; + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() { + return internal::functor_traits >::Cost; + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() { + return internal::functor_traits >::Cost; + } + + EIGEN_DEVICE_FUNC TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {} + EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles) + : bytes_loaded_(bytes_loaded), bytes_stored_(bytes_stored), compute_cycles_(compute_cycles) {} + + EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized, + double packet_size) + : bytes_loaded_(bytes_loaded), + bytes_stored_(bytes_stored), + compute_cycles_(vectorized ? compute_cycles / packet_size : compute_cycles) { + eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded)); + eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored)); + eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { return bytes_loaded_; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { return bytes_stored_; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { return compute_cycles_; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(double load_cost, double store_cost, + double compute_cost) const { + return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + compute_cost * compute_cycles_; + } + + // Drop memory access component. Intended for cases when memory accesses are + // sequential or are completely masked by computations. + EIGEN_DEVICE_FUNC void dropMemoryCost() { + bytes_loaded_ = 0; + bytes_stored_ = 0; + } + + // TODO(rmlarsen): Define min in terms of total cost, not elementwise. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(const TensorOpCost& rhs) const { + double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded()); + double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored()); + double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles()); + return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); + } + + // TODO(rmlarsen): Define max in terms of total cost, not elementwise. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(const TensorOpCost& rhs) const { + double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded()); + double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored()); + double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles()); + return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(const TensorOpCost& rhs) { + bytes_loaded_ += rhs.bytes_loaded(); + bytes_stored_ += rhs.bytes_stored(); + compute_cycles_ += rhs.compute_cycles(); + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) { + bytes_loaded_ *= rhs; + bytes_stored_ *= rhs; + compute_cycles_ *= rhs; + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(TensorOpCost lhs, const TensorOpCost& rhs) { + lhs += rhs; + return lhs; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(TensorOpCost lhs, double rhs) { + lhs *= rhs; + return lhs; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(double lhs, TensorOpCost rhs) { + rhs *= lhs; + return rhs; + } + + friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) { + return os << "[bytes_loaded = " << tc.bytes_loaded() << ", bytes_stored = " << tc.bytes_stored() + << ", compute_cycles = " << tc.compute_cycles() << "]"; + } + + private: + double bytes_loaded_; + double bytes_stored_; + double compute_cycles_; +}; + +// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads +// in [1:max_threads] instead of just switching multi-threading off for small +// work units. +template +class TensorCostModel { + public: + // Scaling from Eigen compute cost to device cycles. + static const int kDeviceCyclesPerComputeCycle = 1; + + // Costs in device cycles. + static const int kStartupCycles = 100000; + static const int kPerThreadCycles = 100000; + static const int kTaskSize = 40000; + + // Returns the number of threads in [1:max_threads] to use for + // evaluating an expression with the given output size and cost per + // coefficient. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(double output_size, const TensorOpCost& cost_per_coeff, + int max_threads) { + double cost = totalCost(output_size, cost_per_coeff); + double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; + // Make sure we don't invoke undefined behavior when we convert to an int. + threads = numext::mini(threads, GenericNumTraits::highest()); + return numext::mini(max_threads, numext::maxi(1, static_cast(threads))); + } + + // taskSize assesses parallel task size. + // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task + // granularity needs to be increased to mitigate parallelization overheads. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(double output_size, const TensorOpCost& cost_per_coeff) { + return totalCost(output_size, cost_per_coeff) / kTaskSize; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(double output_size, + const TensorOpCost& cost_per_coeff) { + // Cost of memory fetches from L2 cache. 64 is typical cache line size. + // 11 is L2 cache latency on Haswell. + // We don't know whether data is in L1, L2 or L3. But we are most interested + // in single-threaded computational time around 100us-10ms (smaller time + // is too small for parallelization, larger time is not interesting + // either because we are probably using all available threads already). + // And for the target time range, L2 seems to be what matters. Data set + // fitting into L1 is too small to take noticeable time. Data set fitting + // only into L3 presumably will take more than 10ms to load and process. + const double kLoadCycles = 1.0 / 64 * 11; + const double kStoreCycles = 1.0 / 64 * 11; + // Scaling from Eigen compute cost to device cycles. + return output_size * cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, kDeviceCyclesPerComputeCycle); + } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h new file mode 100644 index 00000000000..c8e1df73394 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -0,0 +1,309 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H +#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorCustomUnaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor custom class. + * + * + */ +namespace internal { +template +struct traits > { + typedef typename XprType::Scalar Scalar; + typedef typename XprType::StorageKind StorageKind; + typedef typename XprType::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = traits::Layout; + typedef typename traits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorCustomUnaryOp EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef TensorCustomUnaryOp type; +}; + +} // end namespace internal + +template +class TensorCustomUnaryOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func) + : m_expr(expr), m_func(func) {} + + EIGEN_DEVICE_FUNC const CustomUnaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_expr; } + + protected: + typename XprType::Nested m_expr; + const CustomUnaryFunc m_func; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorCustomUnaryOp ArgType; + typedef typename internal::traits::Index Index; + static constexpr int NumDims = internal::traits::NumDimensions; + typedef DSizes Dimensions; + typedef std::remove_const_t Scalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) { + m_dimensions = op.func().dimensions(op.expression()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast( + m_device.get((CoeffReturnType*)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)))); + evalTo(m_result); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + if (m_result) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_result[index]; } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + + protected: + void evalTo(EvaluatorPointerType data) { + TensorMap > result(m_device.get(data), m_dimensions); + m_op.func().eval(m_op.expression(), result, m_device); + } + + Dimensions m_dimensions; + const ArgType m_op; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_result; +}; + +/** \class TensorCustomBinaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor custom class. + * + * + */ +namespace internal { +template +struct traits > { + typedef typename internal::promote_storage_type::ret Scalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef + typename promote_index_type::Index, typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = traits::Layout; + typedef std::conditional_t::val, + typename traits::PointerType, typename traits::PointerType> + PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorCustomBinaryOp& type; +}; + +template +struct nested > { + typedef TensorCustomBinaryOp type; +}; + +} // end namespace internal + +template +class TensorCustomBinaryOp + : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::traits::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, + const CustomBinaryFunc& func) + + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {} + + EIGEN_DEVICE_FUNC const CustomBinaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& lhsExpression() const { + return m_lhs_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& rhsExpression() const { + return m_rhs_xpr; + } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const CustomBinaryFunc m_func; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorCustomBinaryOp XprType; + typedef typename internal::traits::Index Index; + static constexpr int NumDims = internal::traits::NumDimensions; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) { + m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast( + m_device.get((CoeffReturnType*)m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType)))); + evalTo(m_result); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + if (m_result != NULL) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_result[index]; } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + + protected: + void evalTo(EvaluatorPointerType data) { + TensorMap > result(m_device.get(data), m_dimensions); + m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); + } + + Dimensions m_dimensions; + const XprType m_op; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_result; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h new file mode 100644 index 00000000000..9898fdcfbb5 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -0,0 +1,139 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its argument + * on the specified computing 'device' (GPU, thread pool, ...) + * + * Example: + * C.device(EIGEN_GPU) = A + B; + * + * Todo: operator *= and /=. + */ + +template +class TensorDevice { + public: + TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorDevice) + + template + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + template + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + template + EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> + Difference; + Difference difference(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, difference); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; +}; + +/** \class TensorAsyncDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its + * argument asynchronously on the specified device. Currently only + * ThreadPoolDevice implements proper asynchronous execution, while the default + * and GPU devices just run the expression synchronously and call m_done() on + * completion.. + * + * Example: + * auto done = []() { ... expression evaluation done ... }; + * C.device(thread_pool_device, std::move(done)) = A + B; + */ + +template +class TensorAsyncDevice { + public: + TensorAsyncDevice(const DeviceType& device, ExpressionType& expression, DoneCallback done) + : m_device(device), m_expression(expression), m_done(std::move(done)) {} + + template + EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + typedef internal::TensorExecutor Executor; + + Assign assign(m_expression, other); + Executor::run(assign, m_device); + m_done(); + + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; + DoneCallback m_done; +}; + +#ifdef EIGEN_USE_THREADS +template +class TensorAsyncDevice { + public: + TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression, DoneCallback done) + : m_device(device), m_expression(expression), m_done(std::move(done)) {} + + template + EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + typedef internal::TensorAsyncExecutor Executor; + + // WARNING: After assignment 'm_done' callback will be in undefined state. + Assign assign(m_expression, other); + Executor::runAsync(assign, m_device, std::move(m_done)); + + return *this; + } + + protected: + const ThreadPoolDevice& m_device; + ExpressionType& m_expression; + DoneCallback m_done; +}; +#endif + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h new file mode 100644 index 00000000000..c2c8ed002bf --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -0,0 +1,7 @@ + +#if defined(__clang__) || defined(__GNUC__) +#warning \ + "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file" +#endif + +#include "TensorDeviceGpu.h" diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h new file mode 100644 index 00000000000..eaaf3321500 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -0,0 +1,113 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +// Default device for the machine (typically a single cpu core) +struct DefaultDevice { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { internal::aligned_free(buffer); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { return allocate(num_bytes); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { deallocate(buffer); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const { +#ifdef EIGEN_GPU_COMPILE_PHASE + // std::fill is not a device function, so resort to simple loop. + for (T* it = begin; it != end; ++it) { + *it = value; + } +#else + std::fill(begin, end, value); +#endif + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { +#if !defined(EIGEN_GPU_COMPILE_PHASE) + // Running on the host CPU + return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 64; +#else + // Running on a CUDA device + return 32; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) + // Running on the host CPU + return l1CacheSize(); +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 48 * 1024; // FIXME : update this number for HIP +#else + // Running on a CUDA device, return the amount of shared memory available. + return 48 * 1024; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) + // Running single threaded on the host CPU + return l3CacheSize(); +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return firstLevelCacheSize(); // FIXME : update this number for HIP +#else + // Running on a CUDA device + return firstLevelCacheSize(); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { + // Nothing. Default device operations are synchronous. + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { +#if !defined(EIGEN_GPU_COMPILE_PHASE) + // Running single threaded on the host CPU + // Should return an enum that encodes the ISA supported by the CPU + return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + // return 1 as major for HIP + return 1; +#else + // Running on a CUDA device + return EIGEN_CUDA_ARCH / 100; +#endif + } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h new file mode 100644 index 00000000000..4c24bc1f0a9 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h @@ -0,0 +1,395 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H + +// This header file container defines fo gpu* macros which will resolve to +// their equivalent hip* or cuda* versions depending on the compiler in use +// A separate header (included at the end of this file) will undefine all +#include "TensorGpuHipCudaDefines.h" + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +static const int kGpuScratchSize = 1024; + +// This defines an interface that GPUDevice can take to use +// HIP / CUDA streams underneath. +class StreamInterface { + public: + virtual ~StreamInterface() {} + + virtual const gpuStream_t& stream() const = 0; + virtual const gpuDeviceProp_t& deviceProperties() const = 0; + + // Allocate memory on the actual device where the computation will run + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; + + // Return a scratchpad buffer of size 1k + virtual void* scratchpad() const = 0; + + // Return a semaphore. The semaphore is initially initialized to 0, and + // each kernel using it is responsible for resetting to 0 upon completion + // to maintain the invariant that the semaphore is always equal to 0 upon + // each kernel start. + virtual unsigned int* semaphore() const = 0; +}; + +class GpuDeviceProperties { + public: + GpuDeviceProperties() : initialized_(false), first_(true), device_properties_(nullptr) {} + + ~GpuDeviceProperties() { + if (device_properties_) { + delete[] device_properties_; + } + } + + EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { return device_properties_[device]; } + + EIGEN_STRONG_INLINE bool isInitialized() const { return initialized_; } + + void initialize() { + if (!initialized_) { + // Attempts to ensure proper behavior in the case of multiple threads + // calling this function simultaneously. This would be trivial to + // implement if we could use std::mutex, but unfortunately mutex don't + // compile with nvcc, so we resort to atomics and thread fences instead. + // Note that if the caller uses a compiler that doesn't support c++11 we + // can't ensure that the initialization is thread safe. + if (first_.exchange(false)) { + // We're the first thread to reach this point. + int num_devices; + gpuError_t status = gpuGetDeviceCount(&num_devices); + if (status != gpuSuccess) { + std::cerr << "Failed to get the number of GPU devices: " << gpuGetErrorString(status) << std::endl; + gpu_assert(status == gpuSuccess); + } + device_properties_ = new gpuDeviceProp_t[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = gpuGetDeviceProperties(&device_properties_[i], i); + if (status != gpuSuccess) { + std::cerr << "Failed to initialize GPU device #" << i << ": " << gpuGetErrorString(status) << std::endl; + gpu_assert(status == gpuSuccess); + } + } + + std::atomic_thread_fence(std::memory_order_release); + initialized_ = true; + } else { + // Wait for the other thread to inititialize the properties. + while (!initialized_) { + std::atomic_thread_fence(std::memory_order_acquire); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + } + } + } + + private: + volatile bool initialized_; + std::atomic first_; + gpuDeviceProp_t* device_properties_; +}; + +EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() { + static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties(); + if (!deviceProperties->isInitialized()) { + deviceProperties->initialize(); + } + return *deviceProperties; +} + +EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) { + return GetGpuDeviceProperties().get(device); +} + +static const gpuStream_t default_stream = gpuStreamDefault; + +class GpuStreamDevice : public StreamInterface { + public: + // Use the default stream on the current device + GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { + gpuError_t status = gpuGetDevice(&device_); + if (status != gpuSuccess) { + std::cerr << "Failed to get the GPU devices " << gpuGetErrorString(status) << std::endl; + gpu_assert(status == gpuSuccess); + } + } + // Use the default stream on the specified device + GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {} + // Use the specified stream. Note that it's the + // caller responsibility to ensure that the stream can run on + // the specified device. If no device is specified the code + // assumes that the stream is associated to the current gpu device. + GpuStreamDevice(const gpuStream_t* stream, int device = -1) + : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { + if (device < 0) { + gpuError_t status = gpuGetDevice(&device_); + if (status != gpuSuccess) { + std::cerr << "Failed to get the GPU devices " << gpuGetErrorString(status) << std::endl; + gpu_assert(status == gpuSuccess); + } + } else { + int num_devices; + gpuError_t err = gpuGetDeviceCount(&num_devices); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + gpu_assert(device < num_devices); + device_ = device; + } + } + + virtual ~GpuStreamDevice() { + if (scratch_) { + deallocate(scratch_); + } + } + + const gpuStream_t& stream() const { return *stream_; } + const gpuDeviceProp_t& deviceProperties() const { return GetGpuDeviceProperties(device_); } + virtual void* allocate(size_t num_bytes) const { + gpuError_t err = gpuSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + void* result; + err = gpuMalloc(&result, num_bytes); + gpu_assert(err == gpuSuccess); + gpu_assert(result != NULL); + return result; + } + virtual void deallocate(void* buffer) const { + gpuError_t err = gpuSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + gpu_assert(buffer != NULL); + err = gpuFree(buffer); + gpu_assert(err == gpuSuccess); + } + + virtual void* scratchpad() const { + if (scratch_ == NULL) { + scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + virtual unsigned int* semaphore() const { + if (semaphore_ == NULL) { + char* scratch = static_cast(scratchpad()) + kGpuScratchSize; + semaphore_ = reinterpret_cast(scratch); + gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + return semaphore_; + } + + private: + const gpuStream_t* stream_; + int device_; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + +struct GpuDevice { + // The StreamInterface is not owned: the caller is + // responsible for its initialization and eventual destruction. + explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { eigen_assert(stream); } + explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { + eigen_assert(stream); + } + // TODO(bsteiner): This is an internal API, we should not expose it. + EIGEN_STRONG_INLINE const gpuStream_t& stream() const { return stream_->stream(); } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return stream_->allocate(num_bytes); } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { stream_->deallocate(buffer); } + + EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { return stream_->allocate(num_bytes); } + + EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { stream_->deallocate(buffer); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } + + EIGEN_STRONG_INLINE void* scratchpad() const { return stream_->scratchpad(); } + + EIGEN_STRONG_INLINE unsigned int* semaphore() const { return stream_->semaphore(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); +#else + EIGEN_UNUSED_VARIABLE(dst); + EIGEN_UNUSED_VARIABLE(src); + EIGEN_UNUSED_VARIABLE(n); + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); +#else + EIGEN_UNUSED_VARIABLE(buffer) + EIGEN_UNUSED_VARIABLE(c) + EIGEN_UNUSED_VARIABLE(n) + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + template + EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const { +#ifndef EIGEN_GPU_COMPILE_PHASE + const size_t count = end - begin; + // Split value into bytes and run memset with stride. + const int value_size = sizeof(value); + char* buffer = (char*)begin; + char* value_bytes = (char*)(&value); + gpuError_t err; + EIGEN_UNUSED_VARIABLE(err) + + // If all value bytes are equal, then a single memset can be much faster. + bool use_single_memset = true; + for (int i = 1; i < value_size; ++i) { + if (value_bytes[i] != value_bytes[0]) { + use_single_memset = false; + } + } + + if (use_single_memset) { + err = gpuMemsetAsync(buffer, value_bytes[0], count * sizeof(T), stream_->stream()); + gpu_assert(err == gpuSuccess); + } else { + for (int b = 0; b < value_size; ++b) { + err = gpuMemset2DAsync(buffer + b, value_size, value_bytes[b], 1, count, stream_->stream()); + gpu_assert(err == gpuSuccess); + } + } +#else + EIGEN_UNUSED_VARIABLE(begin) + EIGEN_UNUSED_VARIABLE(end) + EIGEN_UNUSED_VARIABLE(value) + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + // FIXME + return 48 * 1024; + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on hip/cuda devices. + return firstLevelCacheSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuStreamSynchronize(stream_->stream()); + if (err != gpuSuccess) { + std::cerr << "Error detected in GPU stream: " << gpuGetErrorString(err) << std::endl; + gpu_assert(err == gpuSuccess); + } +#else + gpu_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const { return stream_->deviceProperties().multiProcessorCount; } + EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const { return stream_->deviceProperties().maxThreadsPerBlock; } + EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const { + return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + EIGEN_STRONG_INLINE int sharedMemPerBlock() const { + return static_cast(stream_->deviceProperties().sharedMemPerBlock); + } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { return stream_->deviceProperties().major; } + EIGEN_STRONG_INLINE int minorDeviceVersion() const { return stream_->deviceProperties().minor; } + + EIGEN_STRONG_INLINE int maxBlocks() const { return max_blocks_; } + + // This function checks if the GPU runtime recorded an error for the + // underlying stream device. + inline bool ok() const { +#ifdef EIGEN_GPUCC + gpuError_t error = gpuStreamQuery(stream_->stream()); + return (error == gpuSuccess) || (error == gpuErrorNotReady); +#else + return false; +#endif + } + + private: + const StreamInterface* stream_; + int max_blocks_; +}; + +#if defined(EIGEN_HIPCC) + +#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \ + gpu_assert(hipGetLastError() == hipSuccess); + +#else + +#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>(__VA_ARGS__); \ + gpu_assert(cudaGetLastError() == cudaSuccess); + +#endif + +// FIXME: Should be device and kernel specific. +#ifdef EIGEN_GPUCC +static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t status = gpuDeviceSetSharedMemConfig(config); + EIGEN_UNUSED_VARIABLE(status) + gpu_assert(status == gpuSuccess); +#else + EIGEN_UNUSED_VARIABLE(config) +#endif +} +#endif + +} // end namespace Eigen + +// undefine all the gpu* macros we defined at the beginning of the file +#include "TensorGpuHipCudaUndefines.h" + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h new file mode 100644 index 00000000000..b291fed66a5 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -0,0 +1,567 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Copyright (C) 2016 Benoit Steiner + +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H +#include + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +namespace TensorSycl { +namespace internal { + +/// Cache all the device information needed +struct SyclDeviceInfo { + SyclDeviceInfo(cl::sycl::queue queue) + : local_mem_type(queue.get_device().template get_info()), + max_work_item_sizes(queue.get_device().template get_info>()), + max_mem_alloc_size(queue.get_device().template get_info()), + max_compute_units(queue.get_device().template get_info()), + max_work_group_size(queue.get_device().template get_info()), + local_mem_size(queue.get_device().template get_info()), + platform_name(queue.get_device().get_platform().template get_info()), + device_name(queue.get_device().template get_info()), + device_vendor(queue.get_device().template get_info()) {} + + cl::sycl::info::local_mem_type local_mem_type; + cl::sycl::id<3> max_work_item_sizes; + unsigned long max_mem_alloc_size; + unsigned long max_compute_units; + unsigned long max_work_group_size; + size_t local_mem_size; + std::string platform_name; + std::string device_name; + std::string device_vendor; +}; + +} // end namespace internal +} // end namespace TensorSycl + +// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and +// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently +// TensorFlow via the Eigen SYCL Backend. +EIGEN_STRONG_INLINE auto get_sycl_supported_devices() -> decltype(cl::sycl::device::get_devices()) { +#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR + return {cl::sycl::device(cl::sycl::default_selector())}; +#else + std::vector supported_devices; + auto platform_list = cl::sycl::platform::get_platforms(); + for (const auto &platform : platform_list) { + auto device_list = platform.get_devices(); + auto platform_name = platform.template get_info(); + std::transform(platform_name.begin(), platform_name.end(), platform_name.begin(), ::tolower); + for (const auto &device : device_list) { + auto vendor = device.template get_info(); + std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); + bool unsupported_condition = (device.is_cpu() && platform_name.find("amd") != std::string::npos && + vendor.find("apu") == std::string::npos) || + (platform_name.find("experimental") != std::string::npos) || device.is_host(); + if (!unsupported_condition) { + supported_devices.push_back(device); + } + } + } + return supported_devices; +#endif +} + +class QueueInterface { + public: + /// Creating device by using cl::sycl::selector or cl::sycl::device. + template + explicit QueueInterface(const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler, + unsigned num_threads = std::thread::hardware_concurrency()) + : m_queue{dev_or_sel, handler, {sycl::property::queue::in_order()}}, + m_thread_pool(num_threads), + m_device_info(m_queue) {} + + template + explicit QueueInterface(const DeviceOrSelector &dev_or_sel, + unsigned num_threads = std::thread::hardware_concurrency()) + : QueueInterface( + dev_or_sel, [this](cl::sycl::exception_list l) { this->exception_caught_ = this->sycl_async_handler(l); }, + num_threads) {} + + explicit QueueInterface(const cl::sycl::queue &q, unsigned num_threads = std::thread::hardware_concurrency()) + : m_queue(q), m_thread_pool(num_threads), m_device_info(m_queue) {} + + EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + return (void *)cl::sycl::aligned_alloc_device(EIGEN_MAX_ALIGN_BYTES, num_bytes, m_queue); +#else + return (void *)cl::sycl::malloc_device(num_bytes, m_queue); +#endif + } + + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { + return (void *)cl::sycl::malloc_device(num_bytes, m_queue); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const { + return data; + } + + EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { deallocate(p); } + + EIGEN_STRONG_INLINE void deallocate_temp(const void *p) const { deallocate_temp(const_cast(p)); } + + EIGEN_STRONG_INLINE void deallocate(void *p) const { cl::sycl::free(p, m_queue); } + + /// The memcpyHostToDevice is used to copy the data from host to device + /// The destination pointer could be deleted before the copy happened which is + /// why a callback function is needed. By default if none is provided, the + /// function is blocking. + EIGEN_STRONG_INLINE void memcpyHostToDevice(void *dst, const void *src, size_t n, + std::function callback) const { + auto e = m_queue.memcpy(dst, src, n); + synchronize_and_callback(e, callback); + } + + /// The memcpyDeviceToHost is used to copy the data from device to host. + /// The source pointer could be deleted before the copy happened which is + /// why a callback function is needed. By default if none is provided, the + /// function is blocking. + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const void *src, size_t n, + std::function callback) const { + if (n == 0) { + if (callback) callback(); + return; + } + auto e = m_queue.memcpy(dst, src, n); + synchronize_and_callback(e, callback); + } + + /// The memcpy function. + /// No callback is required here as both arguments are on the device + /// and SYCL can handle the dependency. + EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { + if (n == 0) { + return; + } + m_queue.memcpy(dst, src, n).wait(); + } + + /// the memset function. + /// No callback is required here as both arguments are on the device + /// and SYCL can handle the dependency. + EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { + if (n == 0) { + return; + } + m_queue.memset(data, c, n).wait(); + } + + template + EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const { + if (begin == end) { + return; + } + const size_t count = end - begin; + m_queue.fill(begin, value, count).wait(); + } + + template + EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(const Lhs &lhs, const Rhs &rhs, OutPtr outptr, + Range thread_range, Index scratchSize, T... var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for(thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...)); + }; + + return m_queue.submit(kernel_functor); + } + + template + EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(const InPtr &inptr, OutPtr &outptr, Range thread_range, + Index scratchSize, T... var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, outptr, var...)); + }; + return m_queue.submit(kernel_functor); + } + + template + EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(const InPtr &inptr, Range thread_range, Index scratchSize, + T... var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, var...)); + }; + + return m_queue.submit(kernel_functor); + } + + EIGEN_STRONG_INLINE void synchronize() const { +#ifdef EIGEN_EXCEPTIONS + m_queue.wait_and_throw(); +#else + m_queue.wait(); +#endif + } + + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { + tileSize = static_cast(getNearestPowerOfTwoWorkGroupSize()); + tileSize = std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(tileSize)); + rng = n; + if (rng == 0) rng = static_cast(1); + GRange = rng; + if (tileSize > GRange) + tileSize = GRange; + else if (GRange > tileSize) { + Index xMode = static_cast(GRange % tileSize); + if (xMode != 0) GRange += static_cast(tileSize - xMode); + } + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(const std::array &input_dim, cl::sycl::range<2> &global_range, + cl::sycl::range<2> &local_range) const { + std::array input_range = input_dim; + Index max_workgroup_Size = static_cast(getNearestPowerOfTwoWorkGroupSize()); + max_workgroup_Size = std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(max_workgroup_Size)); + Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); + local_range[1] = static_cast(std::pow(2, static_cast(pow_of_2 / 2))); + input_range[1] = input_dim[1]; + if (input_range[1] == 0) input_range[1] = static_cast(1); + global_range[1] = input_range[1]; + if (local_range[1] > global_range[1]) + local_range[1] = global_range[1]; + else if (global_range[1] > local_range[1]) { + Index xMode = static_cast(global_range[1] % local_range[1]); + if (xMode != 0) global_range[1] += static_cast(local_range[1] - xMode); + } + local_range[0] = static_cast(max_workgroup_Size / local_range[1]); + input_range[0] = input_dim[0]; + if (input_range[0] == 0) input_range[0] = static_cast(1); + global_range[0] = input_range[0]; + if (local_range[0] > global_range[0]) + local_range[0] = global_range[0]; + else if (global_range[0] > local_range[0]) { + Index xMode = static_cast(global_range[0] % local_range[0]); + if (xMode != 0) global_range[0] += static_cast(local_range[0] - xMode); + } + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(const std::array &input_dim, cl::sycl::range<3> &global_range, + cl::sycl::range<3> &local_range) const { + std::array input_range = input_dim; + Index max_workgroup_Size = static_cast(getNearestPowerOfTwoWorkGroupSize()); + max_workgroup_Size = std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(max_workgroup_Size)); + Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); + local_range[2] = static_cast(std::pow(2, static_cast(pow_of_2 / 3))); + input_range[2] = input_dim[2]; + if (input_range[2] == 0) input_range[1] = static_cast(1); + global_range[2] = input_range[2]; + if (local_range[2] > global_range[2]) + local_range[2] = global_range[2]; + else if (global_range[2] > local_range[2]) { + Index xMode = static_cast(global_range[2] % local_range[2]); + if (xMode != 0) global_range[2] += static_cast(local_range[2] - xMode); + } + pow_of_2 = static_cast(std::log2(static_cast(max_workgroup_Size / local_range[2]))); + local_range[1] = static_cast(std::pow(2, static_cast(pow_of_2 / 2))); + input_range[1] = input_dim[1]; + if (input_range[1] == 0) input_range[1] = static_cast(1); + global_range[1] = input_range[1]; + if (local_range[1] > global_range[1]) + local_range[1] = global_range[1]; + else if (global_range[1] > local_range[1]) { + Index xMode = static_cast(global_range[1] % local_range[1]); + if (xMode != 0) global_range[1] += static_cast(local_range[1] - xMode); + } + local_range[0] = static_cast(max_workgroup_Size / (local_range[1] * local_range[2])); + input_range[0] = input_dim[0]; + if (input_range[0] == 0) input_range[0] = static_cast(1); + global_range[0] = input_range[0]; + if (local_range[0] > global_range[0]) + local_range[0] = global_range[0]; + else if (global_range[0] > local_range[0]) { + Index xMode = static_cast(global_range[0] % local_range[0]); + if (xMode != 0) global_range[0] += static_cast(local_range[0] - xMode); + } + } + + EIGEN_STRONG_INLINE bool has_local_memory() const { +#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) + return false; +#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) + return true; +#else + return m_device_info.local_mem_type == cl::sycl::info::local_mem_type::local; +#endif + } + + EIGEN_STRONG_INLINE unsigned long max_buffer_size() const { return m_device_info.max_mem_alloc_size; } + + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { return m_device_info.max_compute_units; } + + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { return m_device_info.max_work_group_size; } + + EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { return m_device_info.max_work_item_sizes; } + + /// No need for sycl it should act the same as CPU version + EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } + + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + // OpenCL does not have such a concept + return 2; + } + + EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { return m_device_info.local_mem_size; } + + // This function returns the nearest power of 2 Work-group size which is <= + // maximum device workgroup size. + EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { + return getPowerOfTwo(m_device_info.max_work_group_size, false); + } + + EIGEN_STRONG_INLINE std::string getPlatformName() const { return m_device_info.platform_name; } + + EIGEN_STRONG_INLINE std::string getDeviceName() const { return m_device_info.device_name; } + + EIGEN_STRONG_INLINE std::string getDeviceVendor() const { return m_device_info.device_vendor; } + + // This function returns the nearest power of 2 + // if roundup is true returns result>=wgsize + // else it return result <= wgsize + EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const { + if (roundUp) --wGSize; + wGSize |= (wGSize >> 1); + wGSize |= (wGSize >> 2); + wGSize |= (wGSize >> 4); + wGSize |= (wGSize >> 8); + wGSize |= (wGSize >> 16); +#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64 + wGSize |= (wGSize >> 32); +#endif + return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize); + } + + EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; } + + // This function checks if the runtime recorded an error for the + // underlying stream device. + EIGEN_STRONG_INLINE bool ok() const { + if (!exception_caught_) { + synchronize(); + } + return !exception_caught_; + } + + protected: + void synchronize_and_callback(cl::sycl::event e, const std::function &callback) const { + if (callback) { + auto callback_ = [=]() { +#ifdef EIGEN_EXCEPTIONS + cl::sycl::event(e).wait_and_throw(); +#else + cl::sycl::event(e).wait(); +#endif + callback(); + }; + m_thread_pool.Schedule(std::move(callback_)); + } else { +#ifdef EIGEN_EXCEPTIONS + m_queue.wait_and_throw(); +#else + m_queue.wait(); +#endif + } + } + + bool sycl_async_handler(cl::sycl::exception_list exceptions) const { + bool exception_caught = false; + for (const auto &e : exceptions) { + if (e) { + exception_caught = true; + EIGEN_THROW_X(e); + } + } + return exception_caught; + } + + /// class members: + bool exception_caught_ = false; + /// sycl queue + mutable cl::sycl::queue m_queue; + /// The thread pool is used to wait on events and call callbacks + /// asynchronously + mutable Eigen::ThreadPool m_thread_pool; + + const TensorSycl::internal::SyclDeviceInfo m_device_info; +}; + +struct SyclDeviceBase { + /// QueueInterface is not owned. it is the caller's responsibility to destroy + /// it + const QueueInterface *m_queue_stream; + explicit SyclDeviceBase(const QueueInterface *queue_stream) : m_queue_stream(queue_stream) {} + EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const { return m_queue_stream; } +}; + +// Here is a sycl device struct which accept the sycl queue interface +// as an input +struct SyclDevice : public SyclDeviceBase { + explicit SyclDevice(const QueueInterface *queue_stream) : SyclDeviceBase(queue_stream) {} + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { + queue_stream()->parallel_for_setup(n, tileSize, rng, GRange); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(const std::array &input_dim, cl::sycl::range<2> &global_range, + cl::sycl::range<2> &local_range) const { + queue_stream()->parallel_for_setup(input_dim, global_range, local_range); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(const std::array &input_dim, cl::sycl::range<3> &global_range, + cl::sycl::range<3> &local_range) const { + queue_stream()->parallel_for_setup(input_dim, global_range, local_range); + } + + /// allocate device memory + EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { return queue_stream()->allocate(num_bytes); } + + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { return queue_stream()->allocate_temp(num_bytes); } + + /// deallocate device memory + EIGEN_STRONG_INLINE void deallocate(void *p) const { queue_stream()->deallocate(p); } + + EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const { queue_stream()->deallocate_temp(buffer); } + + EIGEN_STRONG_INLINE void deallocate_temp(const void *buffer) const { queue_stream()->deallocate_temp(buffer); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const { + return data; + } + + // some runtime conditions that can be applied here + EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } + + /// memcpyHostToDevice + template + EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n, + std::function callback = {}) const { + queue_stream()->memcpyHostToDevice(dst, src, n, callback); + } + /// memcpyDeviceToHost + template + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n, + std::function callback = {}) const { + queue_stream()->memcpyDeviceToHost(dst, src, n, callback); + } + /// the memcpy function + template + EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { + queue_stream()->memcpy(dst, src, n); + } + /// the memset function + EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { queue_stream()->memset(data, c, n); } + /// the fill function + template + EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const { + queue_stream()->fill(begin, end, value); + } + /// returning the sycl queue + EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return queue_stream()->sycl_queue(); } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on sycl devices. + return firstLevelCacheSize(); + } + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { + return queue_stream()->getNumSyclMultiProcessors(); + } + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { return queue_stream()->maxSyclThreadsPerBlock(); } + EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { return queue_stream()->maxWorkItemSizes(); } + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + // OpenCL does not have such a concept + return queue_stream()->maxSyclThreadsPerMultiProcessor(); + } + EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { return queue_stream()->sharedMemPerBlock(); } + EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { + return queue_stream()->getNearestPowerOfTwoWorkGroupSize(); + } + + EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const { + return queue_stream()->getPowerOfTwo(val, roundUp); + } + /// No need for sycl it should act the same as CPU version + EIGEN_STRONG_INLINE int majorDeviceVersion() const { return queue_stream()->majorDeviceVersion(); } + + EIGEN_STRONG_INLINE void synchronize() const { queue_stream()->synchronize(); } + + // This function checks if the runtime recorded an error for the + // underlying stream device. + EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); } + + EIGEN_STRONG_INLINE bool has_local_memory() const { return queue_stream()->has_local_memory(); } + EIGEN_STRONG_INLINE long max_buffer_size() const { return queue_stream()->max_buffer_size(); } + EIGEN_STRONG_INLINE std::string getPlatformName() const { return queue_stream()->getPlatformName(); } + EIGEN_STRONG_INLINE std::string getDeviceName() const { return queue_stream()->getDeviceName(); } + EIGEN_STRONG_INLINE std::string getDeviceVendor() const { return queue_stream()->getDeviceVendor(); } + template + EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(T... var) const { + return queue_stream()->template binary_kernel_launcher(var...); + } + template + EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(T... var) const { + return queue_stream()->template unary_kernel_launcher(var...); + } + + template + EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(T... var) const { + return queue_stream()->template nullary_kernel_launcher(var...); + } +}; +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h new file mode 100644 index 00000000000..c95c8f22343 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -0,0 +1,376 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +// Runs an arbitrary function and then calls Notify() on the passed in +// Notification. +template +struct FunctionWrapperWithNotification { + static void run(Notification* n, Function f, Args... args) { + f(args...); + if (n) { + n->Notify(); + } + } +}; + +template +struct FunctionWrapperWithBarrier { + static void run(Barrier* b, Function f, Args... args) { + f(args...); + if (b) { + b->Notify(); + } + } +}; + +template +static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) { + if (n) { + n->Wait(); + } +} + +// An abstract interface to a device specific memory allocator. +class Allocator { + public: + virtual ~Allocator() {} + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; +}; + +// Build a thread pool device on top the an existing pool of threads. +struct ThreadPoolDevice { + // The ownership of the thread pool remains with the caller. + ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr) + : pool_(pool), num_threads_(num_cores), allocator_(allocator) {} + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return allocator_ ? allocator_->allocate(num_bytes) : internal::aligned_malloc(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + if (allocator_) { + allocator_->deallocate(buffer); + } else { + internal::aligned_free(buffer); + } + } + + EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { return allocate(num_bytes); } + + EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { deallocate(buffer); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } + + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifdef __ANDROID__ + ::memcpy(dst, src, n); +#else + // TODO(rmlarsen): Align blocks on cache lines. + // We have observed that going beyond 4 threads usually just wastes + // CPU cycles due to the threads competing for memory bandwidth, so we + // statically schedule at most 4 block copies here. + const size_t kMinBlockSize = 32768; + const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4); + if (n <= kMinBlockSize || num_threads < 2) { + ::memcpy(dst, src, n); + } else { + const char* src_ptr = static_cast(src); + char* dst_ptr = static_cast(dst); + const size_t blocksize = (n + (num_threads - 1)) / num_threads; + Barrier barrier(static_cast(num_threads - 1)); + // Launch the last 3 blocks on worker threads. + for (size_t i = 1; i < num_threads; ++i) { + enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] { + ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize, numext::mini(blocksize, n - (i * blocksize))); + }); + } + // Launch the first block on the main thread. + ::memcpy(dst_ptr, src_ptr, blocksize); + barrier.Wait(); + } +#endif + } + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); } + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); } + + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); } + + template + EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const { + std::fill(begin, end, value); + } + + EIGEN_STRONG_INLINE int numThreads() const { return num_threads_; } + + // Number of theads available in the underlying thread pool. This number can + // be different from the value returned by numThreads(). + EIGEN_STRONG_INLINE int numThreadsInPool() const { return pool_->NumThreads(); } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return l1CacheSize(); } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // The l3 cache size is shared between all the cores. + return l3CacheSize() / num_threads_; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { + // Nothing. Threadpool device operations are synchronous. + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { + // Should return an enum that encodes the ISA supported by the CPU + return 1; + } + + template + EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const { + Notification* n = new Notification(); + pool_->Schedule( + std::bind(&FunctionWrapperWithNotification::run, n, std::forward(f), args...)); + return n; + } + + template + EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f, Args&&... args) const { + pool_->Schedule( + std::bind(&FunctionWrapperWithBarrier::run, b, std::forward(f), args...)); + } + + template + EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const { + if (sizeof...(args) > 0) { + pool_->Schedule(std::bind(std::forward(f), args...)); + } else { + pool_->Schedule(std::forward(f)); + } + } + + // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if + // called from one of the threads in pool_. Returns -1 otherwise. + EIGEN_STRONG_INLINE int currentThreadId() const { return pool_->CurrentThreadId(); } + + // WARNING: This function is synchronous and will block the calling thread. + // + // Synchronous parallelFor executes f with [0, n) arguments in parallel and + // waits for completion. F accepts a half-open interval [first, last). Block + // size is chosen based on the iteration cost and resulting parallel + // efficiency. If block_align is not nullptr, it is called to round up the + // block size. + void parallelFor(Index n, const TensorOpCost& cost, std::function block_align, + std::function f) const { + if (EIGEN_PREDICT_FALSE(n <= 0)) { + return; + // Compute small problems directly in the caller thread. + } else if (n == 1 || numThreads() == 1 || CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { + f(0, n); + return; + } + + // Compute block size and total count of blocks. + ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); + + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + Barrier barrier(static_cast(block.count)); + std::function handleRange; + handleRange = [this, block, &handleRange, &barrier, &f](Index firstIdx, Index lastIdx) { + while (lastIdx - firstIdx > block.size) { + // Split into halves and schedule the second half on a different thread. + const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size; + pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); }); + lastIdx = midIdx; + } + // Single block or less, execute directly. + f(firstIdx, lastIdx); + barrier.Notify(); + }; + + if (block.count <= numThreads()) { + // Avoid a thread hop by running the root of the tree and one block on the + // main thread. + handleRange(0, n); + } else { + // Execute the root in the thread pool to avoid running work on more than + // numThreads() threads. + pool_->Schedule([=, &handleRange]() { handleRange(0, n); }); + } + + barrier.Wait(); + } + + // Convenience wrapper for parallelFor that does not align blocks. + void parallelFor(Index n, const TensorOpCost& cost, std::function f) const { + parallelFor(n, cost, nullptr, std::move(f)); + } + + // WARNING: This function is asynchronous and will not block the calling thread. + // + // Asynchronous parallelFor executes f with [0, n) arguments in parallel + // without waiting for completion. When the last block finished, it will call + // 'done' callback. F accepts a half-open interval [first, last). Block size + // is chosen based on the iteration cost and resulting parallel efficiency. If + // block_align is not nullptr, it is called to round up the block size. + void parallelForAsync(Index n, const TensorOpCost& cost, std::function block_align, + std::function f, std::function done) const { + // Compute small problems directly in the caller thread. + if (n <= 1 || numThreads() == 1 || CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { + f(0, n); + done(); + return; + } + + // Compute block size and total count of blocks. + ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); + + ParallelForAsyncContext* const ctx = new ParallelForAsyncContext(block.count, std::move(f), std::move(done)); + + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) { + while (lastIdx - firstIdx > block.size) { + // Split into halves and schedule the second half on a different thread. + const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size; + pool_->Schedule([ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); }); + lastIdx = midIdx; + } + + // Single block or less, execute directly. + ctx->f(firstIdx, lastIdx); + + // Delete async context if it was the last block. + if (ctx->count.fetch_sub(1) == 1) delete ctx; + }; + + if (block.count <= numThreads()) { + // Avoid a thread hop by running the root of the tree and one block on the + // main thread. + ctx->handle_range(0, n); + } else { + // Execute the root in the thread pool to avoid running work on more than + // numThreads() threads. + pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); }); + } + } + + // Convenience wrapper for parallelForAsync that does not align blocks. + void parallelForAsync(Index n, const TensorOpCost& cost, std::function f, + std::function done) const { + parallelForAsync(n, cost, nullptr, std::move(f), std::move(done)); + } + + // Thread pool accessor. + ThreadPoolInterface* getPool() const { return pool_; } + + // Allocator accessor. + Allocator* allocator() const { return allocator_; } + + private: + typedef TensorCostModel CostModel; + + // For parallelForAsync we must keep passed in closures on the heap, and + // delete them only after `done` callback finished. + struct ParallelForAsyncContext { + ParallelForAsyncContext(Index block_count, std::function block_f, + std::function done_callback) + : count(block_count), f(std::move(block_f)), done(std::move(done_callback)) {} + ~ParallelForAsyncContext() { done(); } + + std::atomic count; + std::function f; + std::function done; + + std::function handle_range; + }; + + struct ParallelForBlock { + Index size; // block size + Index count; // number of blocks + }; + + // Calculates block size based on (1) the iteration cost and (2) parallel + // efficiency. We want blocks to be not too small to mitigate parallelization + // overheads; not too large to mitigate tail effect and potential load + // imbalance and we also want number of blocks to be evenly dividable across + // threads. + ParallelForBlock CalculateParallelForBlock(const Index n, const TensorOpCost& cost, + std::function block_align) const { + const double block_size_f = 1.0 / CostModel::taskSize(1, cost); + const Index max_oversharding_factor = 4; + Index block_size = numext::mini( + n, numext::maxi(numext::div_ceil(n, max_oversharding_factor * numThreads()), block_size_f)); + const Index max_block_size = numext::mini(n, 2 * block_size); + + if (block_align) { + Index new_block_size = block_align(block_size); + eigen_assert(new_block_size >= block_size); + block_size = numext::mini(n, new_block_size); + } + + Index block_count = numext::div_ceil(n, block_size); + + // Calculate parallel efficiency as fraction of total CPU time used for + // computations: + double max_efficiency = + static_cast(block_count) / (numext::div_ceil(block_count, numThreads()) * numThreads()); + + // Now try to increase block size up to max_block_size as long as it + // doesn't decrease parallel efficiency. + for (Index prev_block_count = block_count; max_efficiency < 1.0 && prev_block_count > 1;) { + // This is the next block size that divides size into a smaller number + // of blocks than the current block_size. + Index coarser_block_size = numext::div_ceil(n, prev_block_count - 1); + if (block_align) { + Index new_block_size = block_align(coarser_block_size); + eigen_assert(new_block_size >= coarser_block_size); + coarser_block_size = numext::mini(n, new_block_size); + } + if (coarser_block_size > max_block_size) { + break; // Reached max block size. Stop. + } + // Recalculate parallel efficiency. + const Index coarser_block_count = numext::div_ceil(n, coarser_block_size); + eigen_assert(coarser_block_count < prev_block_count); + prev_block_count = coarser_block_count; + const double coarser_efficiency = static_cast(coarser_block_count) / + (numext::div_ceil(coarser_block_count, numThreads()) * numThreads()); + if (coarser_efficiency + 0.01 >= max_efficiency) { + // Taking it. + block_size = coarser_block_size; + block_count = coarser_block_count; + if (max_efficiency < coarser_efficiency) { + max_efficiency = coarser_efficiency; + } + } + } + + return {block_size, block_count}; + } + + ThreadPoolInterface* pool_; + int num_threads_; + Allocator* allocator_; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h new file mode 100644 index 00000000000..315b556aefc --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h @@ -0,0 +1,119 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \internal + * + * \class TensorDimensionList + * \ingroup CXX11_Tensor_Module + * + * \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n. + * + * \sa Tensor + */ + +template +struct DimensionList { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const Index operator[](const Index i) const { return i; } +}; + +namespace internal { + +template +struct array_size > { + static const size_t value = Rank; +}; +template +struct array_size > { + static const size_t value = Rank; +}; + +template +const Index array_get(DimensionList&) { + return n; +} +template +const Index array_get(const DimensionList&) { + return n; +} + +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { return true; } +}; +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { return true; } +}; + +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { return true; } +}; +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { return true; } +}; + +template +struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { return true; } +}; +template +struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { return true; } +}; + +template +struct index_statically_eq_impl > { + static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i == value; } +}; +template +struct index_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i == value; } +}; + +template +struct index_statically_ne_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i != value; } +}; +template +struct index_statically_ne_impl > { + static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i != value; } +}; + +template +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i > value; } +}; +template +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i > value; } +}; + +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i < value; } +}; +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i < value; } +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h new file mode 100644 index 00000000000..06d4dfd30dd --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -0,0 +1,329 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \internal + * + * \class TensorDimensions + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode and store the dimensions of a Tensor. + * + * The Sizes class encodes as part of the type the number of dimensions and the + * sizes corresponding to each dimension. It uses no storage space since it is + * entirely known at compile time. + * The DSizes class is its dynamic sibling: the number of dimensions is known + * at compile time but the sizes are set during execution. + * + * \sa Tensor + */ + +// Boilerplate code +namespace internal { + +template +struct dget { + static const std::ptrdiff_t value = get::value; +}; + +template +struct fixed_size_tensor_index_linearization_helper { + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index run(array const& indices, + const Dimensions& dimensions) { + return array_get < RowMajor ? n - 1 + : (NumIndices - n) > (indices) + dget < RowMajor ? n - 1 + : (NumIndices - n), + Dimensions > ::value * fixed_size_tensor_index_linearization_helper::run( + indices, dimensions); + } +}; + +template +struct fixed_size_tensor_index_linearization_helper { + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index run(array const&, const Dimensions&) { + return 0; + } +}; + +template +struct fixed_size_tensor_index_extraction_helper { + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index run(const Index index, const Dimensions& dimensions) { + const Index mult = (index == n - 1) ? 1 : 0; + return array_get(dimensions) * mult + + fixed_size_tensor_index_extraction_helper::run(index, dimensions); + } +}; + +template +struct fixed_size_tensor_index_extraction_helper { + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index run(const Index, const Dimensions&) { + return 0; + } +}; + +} // end namespace internal + +// Fixed size +template +struct Sizes { + typedef internal::numeric_list Base; + const Base t = Base(); + static const std::ptrdiff_t total_size = internal::arg_prod(Indices...); + static const ptrdiff_t count = Base::count; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const { return Base::count; } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() { return internal::arg_prod(Indices...); } + + EIGEN_DEVICE_FUNC Sizes() {} + template + explicit EIGEN_DEVICE_FUNC Sizes(const array& /*indices*/) { + // todo: add assertion + } + template + EIGEN_DEVICE_FUNC Sizes(DenseIndex...) {} + explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list /*l*/) { + // todo: add assertion + } + + template + Sizes& operator=(const T& /*other*/) { + // add assertion failure if the size of other is different + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[](const std::ptrdiff_t index) const { + return internal::fixed_size_tensor_index_extraction_helper::run(index, t); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t IndexOfColMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run( + indices, t); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t IndexOfRowMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run( + indices, t); + } +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes&) { + return Sizes::total_size; +} +} // namespace internal + +// Boilerplate +namespace internal { +template +struct tensor_index_linearization_helper { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index run(array const& indices, + array const& dimensions) { + return array_get < RowMajor ? n + : (NumIndices - n - 1) > (indices) + array_get < RowMajor + ? n + : (NumIndices - n - 1) > + (dimensions)*tensor_index_linearization_helper::run( + indices, dimensions); + } +}; + +template +struct tensor_index_linearization_helper { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index run(array const& indices, + array const&) { + return array_get < RowMajor ? 0 : NumIndices - 1 > (indices); + } +}; +} // end namespace internal + +// Dynamic size +template +struct DSizes : array { + typedef array Base; + static const int count = NumDims; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumDims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const { + return (NumDims == 0) ? 1 : internal::array_prod(*static_cast(this)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() { + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = 0; + } + } + EIGEN_DEVICE_FUNC explicit DSizes(const array& a) : Base(a) {} + + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { + eigen_assert(NumDims == 1); + (*this)[0] = i0; + } + + EIGEN_DEVICE_FUNC DSizes(const DimensionList& a) { + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + + // Enable DSizes index type promotion only if we are promoting to the + // larger type, e.g. allow to promote dimensions of type int to long. + template + EIGEN_DEVICE_FUNC explicit DSizes( + const array& other, + // Default template parameters require c++11. + std::enable_if_t< + internal::is_same::type>::value, + void*> = 0) { + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = static_cast(other[i]); + } + } + + template + EIGEN_DEVICE_FUNC explicit DSizes(const Eigen::IndexList& dimensions) { + for (int i = 0; i < dimensions.count; ++i) { + (*this)[i] = dimensions[i]; + } + } + + template + EIGEN_DEVICE_FUNC DSizes(const Sizes& a) { + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, + IndexTypes... otherDimensions) + : Base({{firstDimension, secondDimension, otherDimensions...}}) { + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + + EIGEN_DEVICE_FUNC DSizes& operator=(const array& other) { + *static_cast(this) = other; + return *this; + } + + // A constexpr would be so much better here + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array& indices) const { + return internal::tensor_index_linearization_helper::run( + indices, *static_cast(this)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array& indices) const { + return internal::tensor_index_linearization_helper::run( + indices, *static_cast(this)); + } +}; + +template +std::ostream& operator<<(std::ostream& os, const DSizes& dims) { + os << "["; + for (int i = 0; i < NumDims; ++i) { + if (i > 0) os << ", "; + os << dims[i]; + } + os << "]"; + return os; +} + +// Boilerplate +namespace internal { +template +struct tensor_vsize_index_linearization_helper { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index run(array const& indices, + std::vector const& dimensions) { + return array_get < RowMajor ? n + : (NumIndices - n - 1) > (indices) + array_get < RowMajor + ? n + : (NumIndices - n - 1) > + (dimensions)*tensor_vsize_index_linearization_helper::run( + indices, dimensions); + } +}; + +template +struct tensor_vsize_index_linearization_helper { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index run(array const& indices, + std::vector const&) { + return array_get < RowMajor ? 0 : NumIndices - 1 > (indices); + } +}; +} // end namespace internal + +namespace internal { + +template +struct array_size > { + static const ptrdiff_t value = NumDims; +}; +template +struct array_size > { + static const ptrdiff_t value = NumDims; +}; +template +struct array_size > { + static const std::ptrdiff_t value = Sizes::count; +}; +template +struct array_size > { + static const std::ptrdiff_t value = Sizes::count; +}; +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes&) { + return get >::value; +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) { + eigen_assert(false && "should never be called"); + return -1; +} + +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { return false; } +}; +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) { + return numext::equal_strict(array_get(dims1), array_get(dims2)) && + sizes_match_below_dim::run(dims1, dims2); + } +}; +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { return true; } +}; + +} // end namespace internal + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) { + return internal::sizes_match_below_dim::value, + internal::array_size::value>::run(dims1, dims2); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h new file mode 100644 index 00000000000..8f1e4c4ba0b --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -0,0 +1,196 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template class MakePointer_> +struct traits > { + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename MakePointer_::Type PointerType; + + enum { Flags = 0 }; + template + struct MakePointer { + // Intermediate typedef to workaround MSVC issue. + typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; +}; + +template class MakePointer_> +struct eval, Eigen::Dense> { + typedef const TensorEvalToOp& type; +}; + +template class MakePointer_> +struct nested, 1, typename eval >::type> { + typedef TensorEvalToOp type; +}; + +} // end namespace internal + +template class MakePointer_> +class TensorEvalToOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename MakePointer_::Type PointerType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + static constexpr int NumDims = Eigen::internal::traits::NumDimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr) + : m_xpr(expr), m_buffer(buffer) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; } + + protected: + typename XprType::Nested m_xpr; + PointerType m_buffer; +}; + +template class MakePointer_> +struct TensorEvaluator, Device> { + typedef TensorEvalToOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef std::remove_const_t CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = true, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = true + }; + + static constexpr int Layout = TensorEvaluator::Layout; + static constexpr int NumDims = internal::traits::NumDimensions; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + typedef internal::TensorBlockAssignment + TensorBlockAssignment; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()) {} + + EIGEN_STRONG_INLINE ~TensorEvaluator() {} + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) { + EIGEN_UNUSED_VARIABLE(scalar); + eigen_assert(scalar == NULL); + return m_impl.evalSubExprsIfNeeded(m_buffer); + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType scalar, EvalSubExprsCallback done) { + EIGEN_UNUSED_VARIABLE(scalar); + eigen_assert(scalar == NULL); + m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const { m_buffer[i] = m_impl.coeff(i); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const { + internal::pstoret( + m_buffer + i, m_impl.template packet < TensorEvaluator::IsAligned ? Aligned : Unaligned > (i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return m_impl.getResourceRequirements(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlockDesc& desc, TensorBlockScratch& scratch) { + // Add `m_buffer` as destination buffer to the block descriptor. + desc.template AddDestinationBuffer( + /*dst_base=*/m_buffer + desc.offset(), + /*dst_strides=*/internal::strides(m_impl.dimensions())); + + ArgTensorBlock block = m_impl.block(desc, scratch, /*root_of_expr_ast=*/true); + + // If block was evaluated into a destination buffer, there is no need to do + // an assignment. + if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { + TensorBlockAssignment::Run( + TensorBlockAssignment::target(desc.dimensions(), internal::strides(m_impl.dimensions()), m_buffer, + desc.offset()), + block.expr()); + } + block.cleanup(); + } + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_buffer[index]; } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt(m_buffer + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here. + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; } + ArgType expression() const { return m_expression; } + + private: + TensorEvaluator m_impl; + EvaluatorPointerType m_buffer; + const ArgType m_expression; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h new file mode 100644 index 00000000000..d864fb46402 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -0,0 +1,859 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor evaluator classes. + * + * These classes are responsible for the evaluation of the tensor expression. + * + * TODO: add support for more types of expressions, in particular expressions + * leading to lvalues (slicing, reshaping, etc...) + */ + +// Generic evaluator +template +struct TensorEvaluator { + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + typedef Derived XprType; + static constexpr int PacketSize = PacketType::size; + typedef typename internal::traits::template MakePointer::Type TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + // NumDimensions is -1 for variable dim tensors + static constexpr int NumCoords = + internal::traits::NumDimensions > 0 ? internal::traits::NumDimensions : 0; + static constexpr int Layout = Derived::Layout; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic>::value, + PreferBlockAccess = false, + CoordAccess = NumCoords > 0, + RawAccess = true + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(device.get((const_cast(m.data())))), m_dims(m.dimensions()), m_device(device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) { + if (!NumTraits>::RequireInitialization && dest) { + m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); + return false; + } + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType dest, EvalSubExprsCallback done) { + // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation. + done(evalSubExprsIfNeeded(dest)); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data != NULL); + return m_data[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + eigen_assert(m_data != NULL); + return m_data[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt(m_data + index); + } + + // Return a packet starting at `index` where `umask` specifies which elements + // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for + // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding + // float element will be loaded, otherwise 0 will be loaded. + // Function has been templatized to enable Sfinae. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + std::enable_if_t::masked_load_available, PacketReturnTypeT> + partialPacket(Index index, typename internal::unpacket_traits::mask_t umask) const { + return internal::ploadu(m_data + index, umask); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + return internal::pstoret(m_data + index, x); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { + eigen_assert(m_data != NULL); + if (static_cast(Layout) == static_cast(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array& coords) const { + eigen_assert(m_data != NULL); + if (static_cast(Layout) == static_cast(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketType::size); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + eigen_assert(m_data != NULL); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) { + eigen_assert(m_data != NULL); + + typedef typename TensorBlock::XprType TensorBlockExpr; + typedef internal::TensorBlockAssignment TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target(desc.dimensions(), internal::strides(m_dims), m_data, desc.offset()), + block.expr()); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } + + protected: + EvaluatorPointerType m_data; + Dimensions m_dims; + const Device EIGEN_DEVICE_REF m_device; +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T loadConstant(const T* address) { + return *address; +} +// Use the texture cache on CUDA devices whenever possible +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) { + return __ldg(address); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double loadConstant(const double* address) { + return __ldg(address); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half loadConstant(const Eigen::half* address) { + return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); +} +#endif + +} // namespace internal + +// Default evaluator for rvalues +template +struct TensorEvaluator { + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + typedef const Derived XprType; + typedef typename internal::traits::template MakePointer::Type TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + typedef std::remove_const_t ScalarNoConst; + + // NumDimensions is -1 for variable dim tensors + static constexpr int NumCoords = + internal::traits::NumDimensions > 0 ? internal::traits::NumDimensions : 0; + static constexpr int PacketSize = PacketType::size; + static constexpr int Layout = Derived::Layout; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic::value, + PreferBlockAccess = false, + CoordAccess = NumCoords > 0, + RawAccess = true + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC TensorEvaluator(const Derived& m, const Device& device) + : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + if (!NumTraits>::RequireInitialization && data) { + m_device.memcpy((void*)(m_device.get(data)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); + return false; + } + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType dest, EvalSubExprsCallback done) { + // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation. + done(evalSubExprsIfNeeded(dest)); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data != NULL); + return internal::loadConstant(m_data + index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt_ro(m_data + index); + } + + // Return a packet starting at `index` where `umask` specifies which elements + // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for + // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding + // float element will be loaded, otherwise 0 will be loaded. + // Function has been templatized to enable Sfinae. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + std::enable_if_t::masked_load_available, PacketReturnTypeT> + partialPacket(Index index, typename internal::unpacket_traits::mask_t umask) const { + return internal::ploadu(m_data + index, umask); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { + eigen_assert(m_data != NULL); + const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? m_dims.IndexOfColMajor(coords) + : m_dims.IndexOfRowMajor(coords); + return internal::loadConstant(m_data + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketType::size); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + eigen_assert(m_data != NULL); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } + + protected: + EvaluatorPointerType m_data; + Dimensions m_dims; + const Device EIGEN_DEVICE_REF m_device; +}; + +// -------------------- CwiseNullaryOp -------------------- + +template +struct TensorEvaluator, Device> { + typedef TensorCwiseNullaryOp XprType; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = true, + PacketAccess = internal::functor_traits::PacketAccess +#ifdef EIGEN_USE_SYCL + && (PacketType::size > 1) +#endif + , + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + done(true); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() {} + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_wrapper(m_functor, index); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return m_wrapper.template packetOp(m_functor, index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketType::size); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + private: + const NullaryOp m_functor; + TensorEvaluator m_argImpl; + const internal::nullary_wrapper m_wrapper; +}; + +// -------------------- CwiseUnaryOp -------------------- + +template +struct TensorEvaluator, Device> { + typedef TensorCwiseUnaryOp XprType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = + int(TensorEvaluator::PacketAccess) & int(internal::functor_traits::PacketAccess), + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef std::remove_const_t ScalarNoConst; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + static constexpr int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + typedef internal::TensorCwiseUnaryBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_argImpl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_argImpl.cleanup(); } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_functor(m_argImpl.coeff(index)); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return m_functor.packetOp(m_argImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_argImpl.costPerCoeff(vectorized) + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const double functor_cost = internal::functor_traits::Cost; + return m_argImpl.getResourceRequirements().addCostPerCoeff({0, 0, functor_cost / PacketSize}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + return TensorBlock(m_argImpl.block(desc, scratch), m_functor); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + private: + const Device EIGEN_DEVICE_REF m_device; + const UnaryOp m_functor; + TensorEvaluator m_argImpl; +}; + +// -------------------- CwiseBinaryOp -------------------- + +template +struct TensorEvaluator, Device> { + typedef TensorCwiseBinaryOp XprType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = + int(TensorEvaluator::IsAligned) & int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess) & + int(internal::functor_traits::PacketAccess), + BlockAccess = int(TensorEvaluator::BlockAccess) & + int(TensorEvaluator::BlockAccess), + PreferBlockAccess = int(TensorEvaluator::PreferBlockAccess) | + int(TensorEvaluator::PreferBlockAccess), + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), + m_functor(op.functor()), + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout) || + internal::traits::NumDimensions <= 1), + YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int NumDims = internal::array_size::Dimensions>::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock LeftTensorBlock; + typedef typename TensorEvaluator::TensorBlock RightTensorBlock; + + typedef internal::TensorCwiseBinaryBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { + // TODO: use right impl instead if right impl dimensions are known at compile time. + return m_leftImpl.dimensions(); + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + // TODO(ezhulenev): Evaluate two expression in parallel? + m_leftImpl.evalSubExprsIfNeededAsync( + nullptr, [this, done](bool) { m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { + return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return m_functor.packetOp(m_leftImpl.template packet(index), + m_rightImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_leftImpl.costPerCoeff(vectorized) + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const double functor_cost = internal::functor_traits::Cost; + return internal::TensorBlockResourceRequirements::merge(m_leftImpl.getResourceRequirements(), + m_rightImpl.getResourceRequirements()) + .addCostPerCoeff({0, 0, functor_cost / PacketSize}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + desc.DropDestinationBuffer(); + return TensorBlock(m_leftImpl.block(desc, scratch), m_rightImpl.block(desc, scratch), m_functor); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + private: + const Device EIGEN_DEVICE_REF m_device; + const BinaryOp m_functor; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + +// -------------------- CwiseTernaryOp -------------------- + +template +struct TensorEvaluator, Device> { + typedef TensorCwiseTernaryOp XprType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess && internal::functor_traits::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_arg1Impl(op.arg1Expression(), device), + m_arg2Impl(op.arg2Expression(), device), + m_arg3Impl(op.arg3Expression(), device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout) || + internal::traits::NumDimensions <= 1), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, + typename internal::traits::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, + typename internal::traits::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::Index, + typename internal::traits::Index>::value), + STORAGE_INDEX_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::Index, + typename internal::traits::Index>::value), + STORAGE_INDEX_MUST_MATCH) + + eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && + dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { + // TODO: use arg2 or arg3 dimensions if they are known at compile time. + return m_arg1Impl.dimensions(); + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_arg1Impl.evalSubExprsIfNeeded(NULL); + m_arg2Impl.evalSubExprsIfNeeded(NULL); + m_arg3Impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { + m_arg1Impl.cleanup(); + m_arg2Impl.cleanup(); + m_arg3Impl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { + return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return m_functor.packetOp(m_arg1Impl.template packet(index), m_arg2Impl.template packet(index), + m_arg3Impl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_arg1Impl.costPerCoeff(vectorized) + m_arg2Impl.costPerCoeff(vectorized) + + m_arg3Impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + private: + const TernaryOp m_functor; + TensorEvaluator m_arg1Impl; + TensorEvaluator m_arg2Impl; + TensorEvaluator m_arg3Impl; +}; + +// -------------------- SelectOp -------------------- + +template +struct TensorEvaluator, Device> { + typedef TensorSelectOp XprType; + typedef typename XprType::Scalar Scalar; + + using TernarySelectOp = internal::scalar_boolean_select_op::Scalar, + typename internal::traits::Scalar, + typename internal::traits::Scalar>; + static constexpr bool TernaryPacketAccess = + TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess && internal::functor_traits::PacketAccess; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = (TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess && PacketType::HasBlend) || + TernaryPacketAccess, + BlockAccess = TensorEvaluator::BlockAccess && + TensorEvaluator::BlockAccess && + TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_condImpl(op.ifExpression(), device), + m_thenImpl(op.thenExpression(), device), + m_elseImpl(op.elseExpression(), device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); + eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock IfArgTensorBlock; + typedef typename TensorEvaluator::TensorBlock ThenArgTensorBlock; + typedef typename TensorEvaluator::TensorBlock ElseArgTensorBlock; + + struct TensorSelectOpBlockFactory { + template + struct XprType { + typedef TensorSelectOp type; + }; + + template + typename XprType::type expr(const IfArgXprType& if_expr, + const ThenArgXprType& then_expr, + const ElseArgXprType& else_expr) const { + return typename XprType::type(if_expr, then_expr, else_expr); + } + }; + + typedef internal::TensorTernaryExprBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { + // TODO: use then or else impl instead if they happen to be known at compile time. + return m_condImpl.dimensions(); + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_condImpl.evalSubExprsIfNeeded(NULL); + m_thenImpl.evalSubExprsIfNeeded(NULL); + m_elseImpl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { + m_thenImpl.evalSubExprsIfNeeded( + nullptr, [this, done](bool) { m_elseImpl.evalSubExprsIfNeeded(nullptr, [done](bool) { done(true); }); }); + }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_condImpl.cleanup(); + m_thenImpl.cleanup(); + m_elseImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { + return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index); + } + + template = true> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + internal::Selector select; + EIGEN_UNROLL_LOOP + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index + i); + } + return internal::pblend(select, m_thenImpl.template packet(index), + m_elseImpl.template packet(index)); + } + + template = true> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return TernarySelectOp().template packetOp(m_thenImpl.template packet(index), + m_elseImpl.template packet(index), + m_condImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_condImpl.costPerCoeff(vectorized) + + m_thenImpl.costPerCoeff(vectorized).cwiseMax(m_elseImpl.costPerCoeff(vectorized)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + auto then_req = m_thenImpl.getResourceRequirements(); + auto else_req = m_elseImpl.getResourceRequirements(); + + auto merged_req = internal::TensorBlockResourceRequirements::merge(then_req, else_req); + merged_req.cost_per_coeff = then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff); + + return internal::TensorBlockResourceRequirements::merge(m_condImpl.getResourceRequirements(), merged_req); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + // It's unsafe to pass destination buffer to underlying expressions, because + // output might be aliased with one of the inputs. + desc.DropDestinationBuffer(); + + return TensorBlock(m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch), + m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler& cgh) const { + m_condImpl.bind(cgh); + m_thenImpl.bind(cgh); + m_elseImpl.bind(cgh); + } +#endif + private: + TensorEvaluator m_condImpl; + TensorEvaluator m_thenImpl; + TensorEvaluator m_elseImpl; +}; + +} // end namespace Eigen + +#if defined(EIGEN_USE_SYCL) && defined(SYCL_COMPILER_IS_DPCPP) +template +struct cl::sycl::is_device_copyable< + Eigen::TensorEvaluator, + std::enable_if_t>::value>> : std::true_type {}; +#endif + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h new file mode 100644 index 00000000000..fe4b38c4813 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -0,0 +1,672 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** + * \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launch the evaluation of the expression on + * the specified computing device. + * + * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and + * instructions) + * @tparam Tiling can use block based tensor evaluation + * (see TensorBlock.h) + */ +namespace internal { + +/** + * Evaluating TensorBroadcastingOp via coefficient of packet path is extremely + * expensive. If expression has at least one broadcast op in it, and it supports + * block based evaluation, we always prefer it, even for the small tensors. For + * all other tileable ops, block evaluation overhead for small tensors (fits + * into L1) is too large, and we fallback on vectorized evaluation. + */ + +// TODO(ezhulenev): Add specializations for all other types of Tensor ops. + +template +struct ExpressionHasTensorBroadcastingOp { + enum { value = false }; +}; + +template +struct ExpressionHasTensorBroadcastingOp > { + enum { value = ExpressionHasTensorBroadcastingOp::value }; +}; + +template +struct ExpressionHasTensorBroadcastingOp > { + enum { value = ExpressionHasTensorBroadcastingOp::value }; +}; + +template +struct ExpressionHasTensorBroadcastingOp > { + enum { + value = ExpressionHasTensorBroadcastingOp::value || ExpressionHasTensorBroadcastingOp::value + }; +}; + +template +struct ExpressionHasTensorBroadcastingOp > { + enum { value = true }; +}; + +// -------------------------------------------------------------------------- // + +/** + * Default strategy: the expression is evaluated sequentially with a single cpu + * thread, without vectorization and block evaluation. + */ +template +class TensorExecutor { + public: + typedef typename Expression::Index StorageIndex; + + // Including `unsupported/Eigen/CXX11/Tensor` in different translation units + // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR + // violation. If this template is instantiated with a non-default device, it + // means that this header file was included without defining + // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`. + static_assert(std::is_same::value, + "Default executor instantiated with non-default device. " + "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or " + "EIGEN_USE_SYCL before including Eigen headers."); + + static EIGEN_STRONG_INLINE void run(const Expression& expr, const Device& device = DefaultDevice()) { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); + for (StorageIndex i = 0; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + +/** + * Default async execution strategy is not implemented. Currently it's only + * available for ThreadPoolDevice (see definition below). + */ +template +class TensorAsyncExecutor {}; + +/** + * Process all the data with a single cpu thread, using vectorized instructions. + */ +template +class TensorExecutor { + public: + typedef typename Expression::Index StorageIndex; + + static EIGEN_STRONG_INLINE void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); + const int PacketSize = + unpacket_traits::PacketReturnType>::size; + + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not + // unroll the loop at the expense of inlining. + const StorageIndex UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; + for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } + } + const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize; + for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) { + evaluator.evalPacket(i); + } + for (StorageIndex i = VectorizedSize; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + +/** + * Process all the data with a single cpu thread, using blocks of data. By + * sizing a block to fit L1 cache we get better cache performance. + */ +template +class TensorExecutor { + public: + typedef typename traits::Scalar Scalar; + typedef std::remove_const_t ScalarNoConst; + + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; + + static constexpr int NumDims = traits::NumDimensions; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { + typedef TensorBlockMapper TensorBlockMapper; + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + Evaluator evaluator(expr, device); + + // TODO(ezhulenev): Do not use tiling for small tensors? + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + + if (needs_assign) { + // Query expression tree for desired block size/shape. + const TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements(); + + const TensorBlockMapper block_mapper(typename TensorBlockDesc::Dimensions(evaluator.dimensions()), requirements); + + // Share scratch memory allocator between all blocks. + TensorBlockScratch scratch(device); + + const StorageIndex total_block_count = block_mapper.blockCount(); + for (StorageIndex i = 0; i < total_block_count; ++i) { + TensorBlockDesc desc = block_mapper.blockDescriptor(i); + evaluator.evalBlock(desc, scratch); + scratch.reset(); + } + } + evaluator.cleanup(); + } +}; + +/** + * Multicore strategy: the index space is partitioned and each partition is + * executed on a single core. + * + * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread + * pool, and will block the caller thread until all tasks are finished. + * + * (2) TensorAsyncExecutor is a non-blocking version, that will submit work to + * the ThreadPoolDevice managed thread pool, and will return immediately. + * It will call 'done' callback after all tasks are finished. + */ +#ifdef EIGEN_USE_THREADS + +template +struct TensorExecutorTilingContext { + TensorExecutorTilingContext() = default; + TensorExecutorTilingContext(const TensorBlockMapper& b_mapper, const TensorOpCost& b_cost, size_t b_aligned_size) + : block_mapper(b_mapper), cost(b_cost), aligned_blocksize(b_aligned_size) {} + + TensorBlockMapper block_mapper; // navigate through blocks + TensorOpCost cost; // cost of computing a single block + size_t aligned_blocksize; // block size after memory alignment +}; + +// Computes a block evaluation parameters, and allocates temporary memory buffer +// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below. +template +TensorExecutorTilingContext GetTensorExecutorTilingContext(const Evaluator& evaluator) { + // Query expression tree for desired block size/shape. + TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements(); + + // Update target block size based on cost model. + double taskSize = TensorCostModel::taskSize(1, requirements.cost_per_coeff); + requirements.size = static_cast(1.0 / taskSize); + + TensorBlockMapper block_mapper(typename TensorBlockMapper::Dimensions(evaluator.dimensions()), requirements); + + size_t block_size = block_mapper.blockTotalSize(); + const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + const size_t aligned_blocksize = + align * numext::div_ceil(block_size * sizeof(typename Evaluator::Scalar), align); + + return {block_mapper, requirements.cost_per_coeff * block_size, aligned_blocksize}; +} + +template +struct EvalRange { + static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, const StorageIndex lastIdx) { + Evaluator evaluator = *evaluator_in; + eigen_assert(lastIdx >= firstIdx); + for (StorageIndex i = firstIdx; i < lastIdx; ++i) { + evaluator.evalScalar(i); + } + } + + static StorageIndex alignBlockSize(StorageIndex size) { return size; } +}; + +template +struct EvalRange { + static constexpr int PacketSize = unpacket_traits::size; + + static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, const StorageIndex lastIdx) { + Evaluator evaluator = *evaluator_in; + eigen_assert(lastIdx >= firstIdx); + StorageIndex i = firstIdx; + if (lastIdx - firstIdx >= PacketSize) { + eigen_assert(firstIdx % PacketSize == 0); + StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize; + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not + // unroll the loop at the expense of inlining. + for (; i <= last_chunk_offset; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } + } + last_chunk_offset = lastIdx - PacketSize; + for (; i <= last_chunk_offset; i += PacketSize) { + evaluator.evalPacket(i); + } + } + for (; i < lastIdx; ++i) { + evaluator.evalScalar(i); + } + } + + static StorageIndex alignBlockSize(StorageIndex size) { + // Align block size to packet size and account for unrolling in run above. + if (size >= 16 * PacketSize) { + return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); + } + // Aligning to 4 * PacketSize would increase block size by more than 25%. + return (size + PacketSize - 1) & ~(PacketSize - 1); + } +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index StorageIndex; + + static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { + typedef TensorEvaluator Evaluator; + typedef EvalRange EvalRange; + + Evaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); + device.parallelFor( + size, evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize, + [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) { EvalRange::run(&evaluator, firstIdx, lastIdx); }); + } + evaluator.cleanup(); + } +}; + +template +class TensorExecutor { + public: + typedef typename traits::Index IndexType; + typedef typename traits::Scalar Scalar; + typedef std::remove_const_t ScalarNoConst; + + static constexpr int NumDims = traits::NumDimensions; + + typedef TensorEvaluator Evaluator; + typedef TensorBlockMapper BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { + Evaluator evaluator(expr, device); + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const TilingContext tiling = + internal::GetTensorExecutorTilingContext(evaluator); + + auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx, IndexType lastBlockIdx) { + TensorBlockScratch scratch(device); + + for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) { + TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx); + evaluator.evalBlock(desc, scratch); + scratch.reset(); + } + }; + + // Evaluate small expressions directly as a single block. + if (tiling.block_mapper.blockCount() == 1) { + TensorBlockScratch scratch(device); + TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions()); + evaluator.evalBlock(desc, scratch); + } else { + device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, eval_block); + } + } + evaluator.cleanup(); + } +}; + +template +class TensorAsyncExecutor { + public: + typedef typename Expression::Index StorageIndex; + typedef TensorEvaluator Evaluator; + + static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, const ThreadPoolDevice& device, DoneCallback done) { + TensorAsyncExecutorContext* const ctx = new TensorAsyncExecutorContext(expr, device, std::move(done)); + + const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void { + if (!need_assign) { + delete ctx; + return; + } + + typedef EvalRange EvalRange; + const StorageIndex size = array_prod(ctx->evaluator.dimensions()); + device.parallelForAsync( + size, ctx->evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize, + [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { EvalRange::run(&ctx->evaluator, firstIdx, lastIdx); }, + [ctx]() { delete ctx; }); + }; + + ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); + } + + private: + struct TensorAsyncExecutorContext { + TensorAsyncExecutorContext(const Expression& expr, const ThreadPoolDevice& thread_pool, DoneCallback done) + : evaluator(expr, thread_pool), on_done(std::move(done)) {} + + ~TensorAsyncExecutorContext() { + evaluator.cleanup(); + on_done(); + } + + Evaluator evaluator; + + private: + DoneCallback on_done; + }; +}; + +template +class TensorAsyncExecutor { + public: + typedef typename traits::Index IndexType; + typedef typename traits::Scalar Scalar; + typedef std::remove_const_t ScalarNoConst; + + static constexpr int NumDims = traits::NumDimensions; + + typedef TensorEvaluator Evaluator; + typedef TensorBlockMapper BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, const ThreadPoolDevice& device, DoneCallback done) { + TensorAsyncExecutorContext* const ctx = new TensorAsyncExecutorContext(expr, device, std::move(done)); + + const auto on_eval_subexprs = [ctx](bool need_assign) -> void { + if (!need_assign) { + delete ctx; + return; + } + + ctx->tiling = internal::GetTensorExecutorTilingContext(ctx->evaluator); + + auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) { + TensorBlockScratch scratch(ctx->device); + + for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) { + TensorBlockDesc desc = ctx->tiling.block_mapper.blockDescriptor(block_idx); + ctx->evaluator.evalBlock(desc, scratch); + scratch.reset(); + } + }; + + // Evaluate small expressions directly as a single block. + if (ctx->tiling.block_mapper.blockCount() == 1) { + TensorBlockScratch scratch(ctx->device); + TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions()); + ctx->evaluator.evalBlock(desc, scratch); + delete ctx; + } else { + ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(), ctx->tiling.cost, eval_block, + [ctx]() { delete ctx; }); + } + }; + + ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); + } + + private: + struct TensorAsyncExecutorContext { + TensorAsyncExecutorContext(const Expression& expr, const ThreadPoolDevice& thread_pool, DoneCallback done) + : device(thread_pool), evaluator(expr, thread_pool), on_done(std::move(done)) {} + + ~TensorAsyncExecutorContext() { + evaluator.cleanup(); + on_done(); + } + + const ThreadPoolDevice& device; + Evaluator evaluator; + TilingContext tiling; + + private: + DoneCallback on_done; + }; +}; + +#endif // EIGEN_USE_THREADS + +// GPU: the evaluation of the expression is offloaded to a GPU. +#if defined(EIGEN_USE_GPU) + +template +class TensorExecutor { + public: + typedef typename Expression::Index StorageIndex; + static void run(const Expression& expr, const GpuDevice& device); +}; + +#if defined(EIGEN_GPUCC) +// Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0. +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int sum_will_overflow(Index lhs, Index rhs) { + const Index highest = NumTraits::highest(); + const Index lowest = NumTraits::lowest(); + if (lhs > 0 && rhs > 0) { + return lhs > highest - rhs ? 1 : 0; + } else if (lhs < 0 && rhs < 0) { + return lhs < lowest - rhs ? -1 : 0; + } else { + return 0; + } +} + +// Returns lhs + rhs, saturating to the highest/lowest representable value on +// overflow/underflow respectively. +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) { + const Index highest = NumTraits::highest(); + const Index lowest = NumTraits::lowest(); + int overflow = sum_will_overflow(lhs, rhs); + return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs; +} + +// A functor that adds step_size to a given index, saturating to avoid +// overflow/underflow. If overflow/underflow is not possible, regular addition +// is used (for efficiency). +template +struct SafeStep { + // lastIdx is one past the end of the possible indexes. + // step_size is the value that will be added to the given index when the + // functor is called. + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SafeStep(Index lastIdx, Index step_size) + : can_overflow_(sum_will_overflow(lastIdx, step_size)), step_size_(step_size) {} + + // Adds step_size to index, saturating on overflow (if overflow is possible). + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index operator()(Index index) const { + return can_overflow_ ? saturate_add(index, step_size_) : index + step_size_; + } + + private: + const bool can_overflow_; + const Index step_size_; +}; + +template +struct EigenMetaKernelEval { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, + StorageIndex step_size) { + SafeStep safe_step(lastIdx, step_size); + for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) { + eval.evalScalar(i); + } + } +}; + +template +struct EigenMetaKernelEval { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, + StorageIndex step_size) { + const StorageIndex PacketSize = unpacket_traits::size; + const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize; + const StorageIndex vectorized_step_size = step_size * PacketSize; + + SafeStep safe_vectorized_step(vectorized_size, vectorized_step_size); + // Use the vector path + for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size; i = safe_vectorized_step(i)) { + eval.evalPacket(i); + } + SafeStep safe_step(lastIdx, step_size); + for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx; i = safe_step(i)) { + eval.evalScalar(i); + } + } +}; + +template +__global__ void __launch_bounds__(1024) EigenMetaKernel(Evaluator eval, StorageIndex size) { + const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x; + const StorageIndex step_size = blockDim.x * gridDim.x; + + const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; + EigenMetaKernelEval::run(eval, first_index, size, step_size); +} + +/*static*/ +template +EIGEN_STRONG_INLINE void TensorExecutor::run(const Expression& expr, + const GpuDevice& device) { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const int block_size = device.maxGpuThreadsPerBlock(); + const int max_blocks = static_cast( + numext::mini(device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor(), + NumTraits::highest()) / + block_size); + const StorageIndex size = array_prod(evaluator.dimensions()); + // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. + const int num_blocks = numext::maxi( + numext::mini(max_blocks, static_cast(numext::div_ceil(size, block_size))), 1); + + LAUNCH_GPU_KERNEL((EigenMetaKernel, StorageIndex>), num_blocks, block_size, + 0, device, evaluator, size); + } + evaluator.cleanup(); +} + +#endif // EIGEN_GPUCC +#endif // EIGEN_USE_GPU + +// SYCL Executor policy +#ifdef EIGEN_USE_SYCL + +template +struct ExecExprFunctorKernel { + typedef typename Evaluator::Index Index; + Evaluator evaluator; + const Index range; + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(const Scratch, Evaluator evaluator_, const Index range_) + : evaluator(evaluator_), range(range_) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { compute(itemID); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t compute(const cl::sycl::nd_item<1>& itemID) const { + Index gId = static_cast(itemID.get_global_linear_id()); + Index total_threads = itemID.get_global_range(0); + + for (Index i = gId; i < range; i += total_threads) { + evaluator.evalScalar(i); + } + } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t compute(const cl::sycl::nd_item<1>& itemID) const { + const Index vectorizedRange = (range / Evaluator::PacketSize) * Evaluator::PacketSize; + Index gId = static_cast(itemID.get_global_linear_id()); + const Index step = Evaluator::PacketSize * itemID.get_global_range(0); + const Index start = Evaluator::PacketSize * gId; + for (Index i = start; i < vectorizedRange; i += step) { + evaluator.evalPacket(i); + } + gId += vectorizedRange; + for (Index i = gId; i < range; i += itemID.get_global_range(0)) { + evaluator.evalScalar(i); + } + } +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + static EIGEN_STRONG_INLINE void run(const Expression& expr, const Eigen::SyclDevice& dev) { + typedef Eigen::TensorEvaluator Evaluator; + Evaluator evaluator(expr, dev); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + Index range, GRange, tileSize; + Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); + total_size = (total_size == 0) ? 1 : total_size; + const int PacketSize = Eigen::PacketType::size; + Index vectorizable_threads = static_cast(total_size / PacketSize); + dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange); + range = total_size; + + dev.template nullary_kernel_launcher >( + evaluator, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1), + range) + .wait(); + } + evaluator.cleanup(); + } +}; + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h new file mode 100644 index 00000000000..5f4c72367c6 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -0,0 +1,329 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorExpr + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor expression classes. + * + * The TensorCwiseNullaryOp class applies a nullary operators to an expression. + * This is typically used to generate constants. + * + * The TensorCwiseUnaryOp class represents an expression where a unary operator + * (e.g. cwiseSqrt) is applied to an expression. + * + * The TensorCwiseBinaryOp class represents an expression where a binary + * operator (e.g. addition) is applied to a lhs and a rhs expression. + * + */ +namespace internal { +template +struct traits > : traits { + typedef traits XprTraits; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Nested XprTypeNested; + typedef std::remove_reference_t XprTypeNested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; + enum { Flags = 0 }; +}; + +} // end namespace internal + +template +class TensorCwiseNullaryOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef TensorCwiseNullaryOp Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& nestedExpression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC const NullaryOp& functor() const { return m_functor; } + + protected: + typename XprType::Nested m_xpr; + const NullaryOp m_functor; +}; + +namespace internal { +template +struct traits > : traits { + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename result_of::type Scalar; + typedef traits XprTraits; + typedef typename XprType::Nested XprTypeNested; + typedef std::remove_reference_t XprTypeNested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename TypeConversion::type PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorCwiseUnaryOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorCwiseUnaryOp type; +}; + +} // end namespace internal + +template +class TensorCwiseUnaryOp : public TensorBase, ReadOnlyAccessors> { + public: + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC const UnaryOp& functor() const { return m_functor; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC const internal::remove_all_t& nestedExpression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const UnaryOp m_functor; +}; + +namespace internal { +template +struct traits > { + // Type promotion to handle the case where the types of the lhs and the rhs + // are different. + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. + typedef typename result_of::type Scalar; + typedef traits XprTraits; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef + typename promote_index_type::Index, typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef std::remove_reference_t LhsNested_; + typedef std::remove_reference_t RhsNested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename TypeConversion::val, + typename traits::PointerType, + typename traits::PointerType> >::type PointerType; + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorCwiseBinaryOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorCwiseBinaryOp type; +}; + +} // end namespace internal + +template +class TensorCwiseBinaryOp + : public TensorBase, ReadOnlyAccessors> { + public: + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, + const BinaryOp& func = BinaryOp()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} + + EIGEN_DEVICE_FUNC const BinaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC const internal::remove_all_t& lhsExpression() const { + return m_lhs_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& rhsExpression() const { + return m_rhs_xpr; + } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const BinaryOp m_functor; +}; + +namespace internal { +template +struct traits > { + // Type promotion to handle the case where the types of the args are different. + typedef typename result_of::type Scalar; + typedef traits XprTraits; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename Arg1XprType::Nested Arg1Nested; + typedef typename Arg2XprType::Nested Arg2Nested; + typedef typename Arg3XprType::Nested Arg3Nested; + typedef std::remove_reference_t Arg1Nested_; + typedef std::remove_reference_t Arg2Nested_; + typedef std::remove_reference_t Arg3Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename TypeConversion::val, + typename traits::PointerType, + typename traits::PointerType> >::type PointerType; + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorCwiseTernaryOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorCwiseTernaryOp type; +}; + +} // end namespace internal + +template +class TensorCwiseTernaryOp + : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, + const Arg3XprType& arg3, + const TernaryOp& func = TernaryOp()) + : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {} + + EIGEN_DEVICE_FUNC const TernaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC const internal::remove_all_t& arg1Expression() const { + return m_arg1_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& arg2Expression() const { + return m_arg2_xpr; + } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& arg3Expression() const { + return m_arg3_xpr; + } + + protected: + typename Arg1XprType::Nested m_arg1_xpr; + typename Arg2XprType::Nested m_arg2_xpr; + typename Arg3XprType::Nested m_arg3_xpr; + const TernaryOp m_functor; +}; + +namespace internal { +template +struct traits > : traits { + typedef typename traits::Scalar Scalar; + typedef traits XprTraits; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef + typename promote_index_type::Index, typename traits::Index>::type Index; + typedef typename IfXprType::Nested IfNested; + typedef typename ThenXprType::Nested ThenNested; + typedef typename ElseXprType::Nested ElseNested; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef std::conditional_t::val, + typename traits::PointerType, typename traits::PointerType> + PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorSelectOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorSelectOp type; +}; + +} // end namespace internal + +template +class TensorSelectOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC TensorSelectOp(const IfXprType& a_condition, const ThenXprType& a_then, const ElseXprType& a_else) + : m_condition(a_condition), m_then(a_then), m_else(a_else) {} + + EIGEN_DEVICE_FUNC const IfXprType& ifExpression() const { return m_condition; } + + EIGEN_DEVICE_FUNC const ThenXprType& thenExpression() const { return m_then; } + + EIGEN_DEVICE_FUNC const ElseXprType& elseExpression() const { return m_else; } + + protected: + typename IfXprType::Nested m_condition; + typename ThenXprType::Nested m_then; + typename ElseXprType::Nested m_else; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h new file mode 100644 index 00000000000..f5d1b2da357 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -0,0 +1,674 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Jianwei Cui +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H +#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorFFT + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor FFT class. + * + * TODO: + * Vectorize the Cooley Tukey and the Bluestein algorithm + * Add support for multithreaded evaluation + * Improve the performance on GPU + */ + +template +struct MakeComplex { + template + EIGEN_DEVICE_FUNC T operator()(const T& val) const { + return val; + } +}; + +template <> +struct MakeComplex { + template + EIGEN_DEVICE_FUNC std::complex operator()(const T& val) const { + return std::complex(val, 0); + } +}; + +template <> +struct MakeComplex { + template + EIGEN_DEVICE_FUNC std::complex operator()(const std::complex& val) const { + return val; + } +}; + +template +struct PartOf { + template + T operator()(const T& val) const { + return val; + } +}; + +template <> +struct PartOf { + template + T operator()(const std::complex& val) const { + return val.real(); + } +}; + +template <> +struct PartOf { + template + T operator()(const std::complex& val) const { + return val.imag(); + } +}; + +namespace internal { +template +struct traits > : public traits { + typedef traits XprTraits; + typedef typename NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename XprTraits::Scalar InputScalar; + typedef std::conditional_t + OutputScalar; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename traits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorFFTOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorFFTOp type; +}; + +} // end namespace internal + +template +class TensorFFTOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef std::conditional_t + OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft) : m_xpr(expr), m_fft(fft) {} + + EIGEN_DEVICE_FUNC const FFT& fft() const { return m_fft; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const FFT m_fft; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorFFTOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename TensorEvaluator::Dimensions InputDimensions; + typedef internal::traits XprTraits; + typedef typename XprTraits::Scalar InputScalar; + typedef std::conditional_t + OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = internal::unpacket_traits::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = true, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + } + } + m_size = m_dimensions.TotalSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (data) { + evalToBuf(data); + return false; + } else { + m_data = (EvaluatorPointerType)m_device.get( + (CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size))); + evalToBuf(m_data); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + if (m_data) { + m_device.deallocate(m_data); + m_data = NULL; + } + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { return m_data[index]; } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt(m_data + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) { + const bool write_to_out = internal::is_same::value; + ComplexScalar* buf = + write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); + + for (Index i = 0; i < m_size; ++i) { + buf[i] = MakeComplex::value>()(m_impl.coeff(i)); + } + + for (size_t i = 0; i < m_fft.size(); ++i) { + Index dim = m_fft[i]; + eigen_assert(dim >= 0 && dim < NumDims); + Index line_len = m_dimensions[dim]; + eigen_assert(line_len >= 1); + ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len); + const bool is_power_of_two = isPowerOfTwo(line_len); + const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len); + const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite); + + ComplexScalar* a = + is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* b = + is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* pos_j_base_powered = + is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1)); + if (!is_power_of_two) { + // Compute twiddle factors + // t_n = exp(sqrt(-1) * pi * n^2 / line_len) + // for n = 0, 1,..., line_len-1. + // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 + + // The recurrence is correct in exact arithmetic, but causes + // numerical issues for large transforms, especially in + // single-precision floating point. + // + // pos_j_base_powered[0] = ComplexScalar(1, 0); + // if (line_len > 1) { + // const ComplexScalar pos_j_base = ComplexScalar( + // numext::cos(M_PI / line_len), numext::sin(M_PI / line_len)); + // pos_j_base_powered[1] = pos_j_base; + // if (line_len > 2) { + // const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; + // for (int i = 2; i < line_len + 1; ++i) { + // pos_j_base_powered[i] = pos_j_base_powered[i - 1] * + // pos_j_base_powered[i - 1] / + // pos_j_base_powered[i - 2] * + // pos_j_base_sq; + // } + // } + // } + // TODO(rmlarsen): Find a way to use Eigen's vectorized sin + // and cosine functions here. + for (int j = 0; j < line_len + 1; ++j) { + double arg = ((EIGEN_PI * j) * j) / line_len; + std::complex tmp(numext::cos(arg), numext::sin(arg)); + pos_j_base_powered[j] = static_cast(tmp); + } + } + + for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) { + const Index base_offset = getBaseOffsetFromIndex(partial_index, dim); + + // get data into line_buf + const Index stride = m_strides[dim]; + if (stride == 1) { + m_device.memcpy(line_buf, &buf[base_offset], line_len * sizeof(ComplexScalar)); + } else { + Index offset = base_offset; + for (int j = 0; j < line_len; ++j, offset += stride) { + line_buf[j] = buf[offset]; + } + } + + // process the line + if (is_power_of_two) { + processDataLineCooleyTukey(line_buf, line_len, log_len); + } else { + processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered); + } + + // write back + if (FFTDir == FFT_FORWARD && stride == 1) { + m_device.memcpy(&buf[base_offset], line_buf, line_len * sizeof(ComplexScalar)); + } else { + Index offset = base_offset; + const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); + for (int j = 0; j < line_len; ++j, offset += stride) { + buf[offset] = (FFTDir == FFT_FORWARD) ? line_buf[j] : line_buf[j] * div_factor; + } + } + } + m_device.deallocate(line_buf); + if (!is_power_of_two) { + m_device.deallocate(a); + m_device.deallocate(b); + m_device.deallocate(pos_j_base_powered); + } + } + + if (!write_to_out) { + for (Index i = 0; i < m_size; ++i) { + data[i] = PartOf()(buf[i]); + } + m_device.deallocate(buf); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) { + eigen_assert(x > 0); + return !(x & (x - 1)); + } + + // The composite number for padding, used in Bluestein's FFT algorithm + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) { + Index i = 2; + while (i < 2 * n - 1) i *= 2; + return i; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) { + Index log2m = 0; + while (m >>= 1) log2m++; + return log2m; + } + + // Call Cooley Tukey algorithm directly, data length must be power of 2 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len, + Index log_len) { + eigen_assert(isPowerOfTwo(line_len)); + scramble_FFT(line_buf, line_len); + compute_1D_Butterfly(line_buf, line_len, log_len); + } + + // Call Bluestein's FFT algorithm, m is a good composite number greater than (2 * n - 1), used as the padding length + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len, + Index good_composite, Index log_len, + ComplexScalar* a, ComplexScalar* b, + const ComplexScalar* pos_j_base_powered) { + Index n = line_len; + Index m = good_composite; + ComplexScalar* data = line_buf; + + for (Index i = 0; i < n; ++i) { + if (FFTDir == FFT_FORWARD) { + a[i] = data[i] * numext::conj(pos_j_base_powered[i]); + } else { + a[i] = data[i] * pos_j_base_powered[i]; + } + } + for (Index i = n; i < m; ++i) { + a[i] = ComplexScalar(0, 0); + } + + for (Index i = 0; i < n; ++i) { + if (FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[i]; + } else { + b[i] = numext::conj(pos_j_base_powered[i]); + } + } + for (Index i = n; i < m - n; ++i) { + b[i] = ComplexScalar(0, 0); + } + for (Index i = m - n; i < m; ++i) { + if (FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[m - i]; + } else { + b[i] = numext::conj(pos_j_base_powered[m - i]); + } + } + + scramble_FFT(a, m); + compute_1D_Butterfly(a, m, log_len); + + scramble_FFT(b, m); + compute_1D_Butterfly(b, m, log_len); + + for (Index i = 0; i < m; ++i) { + a[i] *= b[i]; + } + + scramble_FFT(a, m); + compute_1D_Butterfly(a, m, log_len); + + // Do the scaling after ifft + for (Index i = 0; i < m; ++i) { + a[i] /= m; + } + + for (Index i = 0; i < n; ++i) { + if (FFTDir == FFT_FORWARD) { + data[i] = a[i] * numext::conj(pos_j_base_powered[i]); + } else { + data[i] = a[i] * pos_j_base_powered[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) { + eigen_assert(isPowerOfTwo(n)); + Index j = 1; + for (Index i = 1; i < n; ++i) { + if (j > i) { + std::swap(data[j - 1], data[i - 1]); + } + Index m = n >> 1; + while (m >= 2 && j > m) { + j -= m; + m >>= 1; + } + j += m; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_2(ComplexScalar* data) { + ComplexScalar tmp = data[1]; + data[1] = data[0] - data[1]; + data[0] += tmp; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_4(ComplexScalar* data) { + ComplexScalar tmp[4]; + tmp[0] = data[0] + data[1]; + tmp[1] = data[0] - data[1]; + tmp[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]); + } else { + tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]); + } + data[0] = tmp[0] + tmp[2]; + data[1] = tmp[1] + tmp[3]; + data[2] = tmp[0] - tmp[2]; + data[3] = tmp[1] - tmp[3]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_8(ComplexScalar* data) { + ComplexScalar tmp_1[8]; + ComplexScalar tmp_2[8]; + + tmp_1[0] = data[0] + data[1]; + tmp_1[1] = data[0] - data[1]; + tmp_1[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1); + } else { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1); + } + tmp_1[4] = data[4] + data[5]; + tmp_1[5] = data[4] - data[5]; + tmp_1[6] = data[6] + data[7]; + if (Dir == FFT_FORWARD) { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1); + } else { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1); + } + tmp_2[0] = tmp_1[0] + tmp_1[2]; + tmp_2[1] = tmp_1[1] + tmp_1[3]; + tmp_2[2] = tmp_1[0] - tmp_1[2]; + tmp_2[3] = tmp_1[1] - tmp_1[3]; + tmp_2[4] = tmp_1[4] + tmp_1[6]; +// SQRT2DIV2 = sqrt(2)/2 +#define SQRT2DIV2 0.7071067811865476 + if (Dir == FFT_FORWARD) { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2); + } else { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2); + } + data[0] = tmp_2[0] + tmp_2[4]; + data[1] = tmp_2[1] + tmp_2[5]; + data[2] = tmp_2[2] + tmp_2[6]; + data[3] = tmp_2[3] + tmp_2[7]; + data[4] = tmp_2[0] - tmp_2[4]; + data[5] = tmp_2[1] - tmp_2[5]; + data[6] = tmp_2[2] - tmp_2[6]; + data[7] = tmp_2[3] - tmp_2[7]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge(ComplexScalar* data, Index n, Index n_power_of_2) { + // Original code: + // RealScalar wtemp = std::sin(M_PI/n); + // RealScalar wpi = -std::sin(2 * M_PI/n); + const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2]; + const RealScalar wpi = + (Dir == FFT_FORWARD) ? m_minus_sin_2_PI_div_n_LUT[n_power_of_2] : -m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; + + const ComplexScalar wp(wtemp, wpi); + const ComplexScalar wp_one = wp + ComplexScalar(1, 0); + const ComplexScalar wp_one_2 = wp_one * wp_one; + const ComplexScalar wp_one_3 = wp_one_2 * wp_one; + const ComplexScalar wp_one_4 = wp_one_3 * wp_one; + const Index n2 = n / 2; + ComplexScalar w(1.0, 0.0); + for (Index i = 0; i < n2; i += 4) { + ComplexScalar temp0(data[i + n2] * w); + ComplexScalar temp1(data[i + 1 + n2] * w * wp_one); + ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2); + ComplexScalar temp3(data[i + 3 + n2] * w * wp_one_3); + w = w * wp_one_4; + + data[i + n2] = data[i] - temp0; + data[i] += temp0; + + data[i + 1 + n2] = data[i + 1] - temp1; + data[i + 1] += temp1; + + data[i + 2 + n2] = data[i + 2] - temp2; + data[i + 2] += temp2; + + data[i + 3 + n2] = data[i + 3] - temp3; + data[i + 3] += temp3; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly(ComplexScalar* data, Index n, Index n_power_of_2) { + eigen_assert(isPowerOfTwo(n)); + if (n > 8) { + compute_1D_Butterfly(data, n / 2, n_power_of_2 - 1); + compute_1D_Butterfly(data + n / 2, n / 2, n_power_of_2 - 1); + butterfly_1D_merge(data, n, n_power_of_2); + } else if (n == 8) { + butterfly_8(data); + } else if (n == 4) { + butterfly_4(data); + } else if (n == 2) { + butterfly_2(data); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const { + Index result = 0; + + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > omitted_dim; --i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } else { + for (Index i = 0; i < omitted_dim; ++i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } + // Value of index_coords[omitted_dim] is not determined to this step + return result; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, Index omitted_dim, Index offset) const { + Index result = base + offset * m_strides[omitted_dim]; + return result; + } + + protected: + Index m_size; + const FFT EIGEN_DEVICE_REF m_fft; + Dimensions m_dimensions; + array m_strides; + TensorEvaluator m_impl; + EvaluatorPointerType m_data; + const Device EIGEN_DEVICE_REF m_device; + + // This will support a maximum FFT size of 2^32 for each dimension + // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; + const RealScalar m_sin_PI_div_n_LUT[32] = {RealScalar(0.0), + RealScalar(-2), + RealScalar(-0.999999999999999), + RealScalar(-0.292893218813453), + RealScalar(-0.0761204674887130), + RealScalar(-0.0192147195967696), + RealScalar(-0.00481527332780311), + RealScalar(-0.00120454379482761), + RealScalar(-3.01181303795779e-04), + RealScalar(-7.52981608554592e-05), + RealScalar(-1.88247173988574e-05), + RealScalar(-4.70619042382852e-06), + RealScalar(-1.17654829809007e-06), + RealScalar(-2.94137117780840e-07), + RealScalar(-7.35342821488550e-08), + RealScalar(-1.83835707061916e-08), + RealScalar(-4.59589268710903e-09), + RealScalar(-1.14897317243732e-09), + RealScalar(-2.87243293150586e-10), + RealScalar(-7.18108232902250e-11), + RealScalar(-1.79527058227174e-11), + RealScalar(-4.48817645568941e-12), + RealScalar(-1.12204411392298e-12), + RealScalar(-2.80511028480785e-13), + RealScalar(-7.01277571201985e-14), + RealScalar(-1.75319392800498e-14), + RealScalar(-4.38298482001247e-15), + RealScalar(-1.09574620500312e-15), + RealScalar(-2.73936551250781e-16), + RealScalar(-6.84841378126949e-17), + RealScalar(-1.71210344531737e-17), + RealScalar(-4.28025861329343e-18)}; + + // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i)); + const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = {RealScalar(0.0), + RealScalar(0.0), + RealScalar(-1.00000000000000e+00), + RealScalar(-7.07106781186547e-01), + RealScalar(-3.82683432365090e-01), + RealScalar(-1.95090322016128e-01), + RealScalar(-9.80171403295606e-02), + RealScalar(-4.90676743274180e-02), + RealScalar(-2.45412285229123e-02), + RealScalar(-1.22715382857199e-02), + RealScalar(-6.13588464915448e-03), + RealScalar(-3.06795676296598e-03), + RealScalar(-1.53398018628477e-03), + RealScalar(-7.66990318742704e-04), + RealScalar(-3.83495187571396e-04), + RealScalar(-1.91747597310703e-04), + RealScalar(-9.58737990959773e-05), + RealScalar(-4.79368996030669e-05), + RealScalar(-2.39684498084182e-05), + RealScalar(-1.19842249050697e-05), + RealScalar(-5.99211245264243e-06), + RealScalar(-2.99605622633466e-06), + RealScalar(-1.49802811316901e-06), + RealScalar(-7.49014056584716e-07), + RealScalar(-3.74507028292384e-07), + RealScalar(-1.87253514146195e-07), + RealScalar(-9.36267570730981e-08), + RealScalar(-4.68133785365491e-08), + RealScalar(-2.34066892682746e-08), + RealScalar(-1.17033446341373e-08), + RealScalar(-5.85167231706864e-09), + RealScalar(-2.92583615853432e-09)}; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h new file mode 100644 index 00000000000..7f229cb063d --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -0,0 +1,226 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H +#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorFixedSize + * \ingroup CXX11_Tensor_Module + * + * \brief The fixed sized version of the tensor class. + * + * The fixed sized equivalent of + * Eigen::Tensor t(3, 5, 7); + * is + * Eigen::TensorFixedSize> t; + */ + +template +class TensorFixedSize : public TensorBase > { + public: + typedef TensorFixedSize Self; + typedef TensorBase > Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef Scalar_ Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + static constexpr int Options = Options_; + static constexpr int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + + enum { + IsAligned = bool(EIGEN_MAX_ALIGN_BYTES > 0), + PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = true, + RawAccess = true + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + typedef Dimensions_ Dimensions; + static constexpr std::size_t NumIndices = Dimensions::count; + + protected: + TensorStorage m_storage; + + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar* data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array{{firstIndex, otherIndices...}}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array{{firstIndex, otherIndices...}}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array{{firstIndex, otherIndices...}}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const { + eigen_assert(checkIndexRange(indices)); + return coeff(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array{{firstIndex, otherIndices...}}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) { + eigen_assert(checkIndexRange(indices)); + return coeffRef(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeffRef(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize() : m_storage() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(const Self& other) : Base(other), m_storage(other.m_storage) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other) : m_storage(other.m_storage) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase& other) { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase& other) { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. + EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(TensorFixedSize) + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool checkIndexRange(const array& /*indices*/) const { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::lesser_op; + using internal::logical_and_op; + + return true; + // check whether the indices are all >= 0 + /* array_apply_and_reduce(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce(indices, m_storage.dimensions());*/ + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const { + if (Options & RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h new file mode 100644 index 00000000000..d0fbfb38a89 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -0,0 +1,233 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template +struct traits> { + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; + + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorForcedEvalOp& type; +}; + +template +struct nested, 1, typename eval>::type> { + typedef TensorForcedEvalOp type; +}; + +} // end namespace internal + +template +class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + +namespace internal { +template +struct non_integral_type_placement_new { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index numValues, StorageType m_buffer) { + // Initialize non-trivially constructible types. + if (!internal::is_arithmetic::value) { + for (Index i = 0; i < numValues; ++i) new (m_buffer + i) CoeffReturnType(); + } + } +}; + +// SYCL does not support non-integral types +// having new (m_buffer + i) CoeffReturnType() causes the following compiler error for SYCL Devices +// no matching function for call to 'operator new' +template +struct non_integral_type_placement_new { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index, StorageType) {} +}; +} // end namespace internal + +template +class DeviceTempPointerHolder { + public: + DeviceTempPointerHolder(const Device& device, size_t size) + : device_(device), size_(size), ptr_(device.allocate_temp(size)) {} + + ~DeviceTempPointerHolder() { + device_.deallocate_temp(ptr_); + size_ = 0; + ptr_ = nullptr; + } + + void* ptr() { return ptr_; } + + private: + Device device_; + size_t size_; + void* ptr_; +}; + +template +struct TensorEvaluator, Device> { + typedef const internal::remove_all_t ArgType; + typedef TensorForcedEvalOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = true, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic::value, + PreferBlockAccess = false, + RawAccess = true + }; + + static constexpr int Layout = TensorEvaluator::Layout; + static constexpr int NumDims = internal::traits::NumDimensions; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), + m_op(op.expression()), + m_device(device), + m_buffer_holder(nullptr), + m_buffer(nullptr) {} + + ~TensorEvaluator() { cleanup(); } + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + const Index numValues = internal::array_prod(m_impl.dimensions()); + m_buffer_holder = std::make_shared>(m_device, numValues * sizeof(CoeffReturnType)); + m_buffer = static_cast(m_buffer_holder->ptr()); + + internal::non_integral_type_placement_new()(numValues, m_buffer); + + typedef TensorEvalToOp> EvalTo; + EvalTo evalToTmp(m_device.get(m_buffer), m_op); + + internal::TensorExecutor, + /*Vectorizable=*/internal::IsVectorizable::value, + /*Tiling=*/internal::IsTileable::value>::run(evalToTmp, m_device); + + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + const Index numValues = internal::array_prod(m_impl.dimensions()); + m_buffer_holder = std::make_shared>(m_device, numValues * sizeof(CoeffReturnType)); + m_buffer = static_cast(m_buffer_holder->ptr()); + + typedef TensorEvalToOp> EvalTo; + EvalTo evalToTmp(m_device.get(m_buffer), m_op); + + auto on_done = std::bind([](EvalSubExprsCallback done_) { done_(true); }, std::move(done)); + internal::TensorAsyncExecutor< + const EvalTo, std::remove_const_t, decltype(on_done), + /*Vectorizable=*/internal::IsVectorizable::value, + /*Tiling=*/internal::IsTileable::value>::runAsync(evalToTmp, m_device, + std::move(on_done)); + } +#endif + + EIGEN_STRONG_INLINE void cleanup() { + m_buffer_holder = nullptr; + m_buffer = nullptr; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_buffer[index]; } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt(m_buffer + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + eigen_assert(m_buffer != nullptr); + return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EvaluatorPointerType data() const { return m_buffer; } + + private: + TensorEvaluator m_impl; + const ArgType m_op; + const Device EIGEN_DEVICE_REF m_device; + std::shared_ptr> m_buffer_holder; + EvaluatorPointerType m_buffer; // Cached copy of the value stored in m_buffer_holder. +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h new file mode 100644 index 00000000000..49c20a4e285 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -0,0 +1,215 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +// MakePointer class is used as a container of the address space of the pointer +// on the host and on the device. From the host side it generates the T* pointer +// and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to +// T* m_data on the host. It is always called on the device. +// Specialisation of MakePointer class for creating the sycl buffer with +// map_allocator. +template +struct MakePointer { + typedef T* Type; + typedef const T* ConstType; +}; + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* constCast(const T* data) { + return const_cast(data); +} + +// The StorageMemory class is a container of the device specific pointer +// used for referring to a Pointer on TensorEvaluator class. While the TensorExpression +// is a device-agnostic type and need MakePointer class for type conversion, +// the TensorEvaluator class can be specialized for a device, hence it is possible +// to construct different types of temporary storage memory in TensorEvaluator +// for different devices by specializing the following StorageMemory class. +template +struct StorageMemory : MakePointer {}; + +namespace internal { +template +struct Pointer_type_promotion { + static const bool val = false; +}; +template +struct Pointer_type_promotion { + static const bool val = true; +}; +template +struct TypeConversion { + typedef A* type; +}; +} // namespace internal + +template class MakePointer_ = MakePointer> +class TensorMap; +template +class Tensor; +template +class TensorFixedSize; +template +class TensorRef; +template +class TensorBase; + +template +class TensorCwiseNullaryOp; +template +class TensorCwiseUnaryOp; +template +class TensorCwiseBinaryOp; +template +class TensorCwiseTernaryOp; +template +class TensorSelectOp; +template class MakePointer_ = MakePointer> +class TensorReductionOp; +template +class TensorIndexPairOp; +template +class TensorPairReducerOp; +template +class TensorConcatenationOp; +template +class TensorContractionOp; +template +class TensorConversionOp; +template +class TensorConvolutionOp; +template +class TensorFFTOp; +template +class TensorPatchOp; +template +class TensorImagePatchOp; +template +class TensorVolumePatchOp; +template +class TensorBroadcastingOp; +template +class TensorChippingOp; +template +class TensorReshapingOp; +template +class TensorLayoutSwapOp; +template +class TensorSlicingOp; +template +class TensorReverseOp; +template +class TensorRollOp; +template +class TensorPaddingOp; +template +class TensorShufflingOp; +template +class TensorStridingOp; +template +class TensorStridingSlicingOp; +template +class TensorInflationOp; +template +class TensorGeneratorOp; +template +class TensorAssignOp; +template +class TensorScanOp; +template +class TensorTraceOp; + +template +class TensorCustomUnaryOp; +template +class TensorCustomBinaryOp; + +template class MakePointer_ = MakePointer> +class TensorEvalToOp; +template +class TensorForcedEvalOp; + +template +class TensorDevice; +template +class TensorAsyncDevice; +template +struct TensorEvaluator; + +struct NoOpOutputKernel; + +struct DefaultDevice; +struct ThreadPoolDevice; +struct GpuDevice; +struct SyclDevice; + +#ifdef EIGEN_USE_SYCL +namespace TensorSycl { +namespace internal { +template +class GenericNondeterministicReducer; +} +} // namespace TensorSycl +#endif + +enum FFTResultType { RealPart = 0, ImagPart = 1, BothParts = 2 }; + +enum FFTDirection { FFT_FORWARD = 0, FFT_REVERSE = 1 }; + +namespace internal { + +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess; +}; + +template +struct IsVectorizable { + static const bool value = + TensorEvaluator::PacketAccess && TensorEvaluator::IsAligned; +}; + +// Tiled evaluation strategy. +enum TiledEvaluation { + Off = 0, // tiled evaluation is not supported + On = 1, // still work in progress (see TensorBlock.h) +}; + +template +struct IsTileable { + // Check that block evaluation is supported and it's a preferred option (at + // least one sub-expression has much faster block evaluation, e.g. + // broadcasting). + static constexpr bool BlockAccess = + TensorEvaluator::BlockAccess && TensorEvaluator::PreferBlockAccess; + + static const TiledEvaluation value = BlockAccess ? TiledEvaluation::On : TiledEvaluation::Off; +}; + +template ::value, + TiledEvaluation Tiling = IsTileable::value> +class TensorExecutor; + +template ::value, + TiledEvaluation Tiling = IsTileable::value> +class TensorAsyncExecutor; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h new file mode 100644 index 00000000000..7a87594c525 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -0,0 +1,422 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +/** \internal + * \brief Template functor to compute the modulo between an array and a scalar. + */ +template +struct scalar_mod_op { + EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const { return a % m_divisor; } + const Scalar m_divisor; +}; +template +struct functor_traits > { + enum { Cost = scalar_div_cost::value, PacketAccess = false }; +}; + +/** \internal + * \brief Template functor to compute the modulo between 2 arrays. + */ +template +struct scalar_mod2_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const { return a % b; } +}; +template +struct functor_traits > { + enum { Cost = scalar_div_cost::value, PacketAccess = false }; +}; + +template +struct scalar_fmod_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const { + return numext::fmod(a, b); + } +}; +template +struct functor_traits > { + enum { + Cost = 13, // Reciprocal throughput of FPREM on Haswell. + PacketAccess = false + }; +}; + +template +struct reducer_traits { + enum { Cost = 1, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true }; +}; + +// Standard reduction functors +template +struct SumReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + internal::scalar_sum_op sum_op; + *accum = sum_op(*accum, t); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = padd(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(0); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_sum_op sum_op; + return sum_op(saccum, predux(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasAdd, + IsStateful = false, + IsExactlyAssociative = NumTraits::IsInteger + }; +}; + +template +struct MeanReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MeanReducer() : scalarCount_(0), packetCount_(0) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + internal::scalar_sum_op sum_op; + *accum = sum_op(*accum, t); + scalarCount_++; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { + (*accum) = padd(*accum, p); + packetCount_++; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(0); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + internal::scalar_quotient_op quotient_op; + return quotient_op(accum, T(scalarCount_)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return pdiv(vaccum, pset1(T(packetCount_))); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_sum_op sum_op; + internal::scalar_quotient_op quotient_op; + return quotient_op(sum_op(saccum, predux(vaccum)), T(scalarCount_ + packetCount_ * unpacket_traits::size)); + } + + protected: + DenseIndex scalarCount_; + DenseIndex packetCount_; +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasAdd && PacketType::HasDiv && !NumTraits::IsInteger, + IsStateful = true, + IsExactlyAssociative = NumTraits::IsInteger + }; +}; + +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { return Eigen::NumTraits::lowest(); } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { return -Eigen::NumTraits::infinity(); } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { return Eigen::NumTraits::highest(); } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { return Eigen::NumTraits::infinity(); } +}; + +template +struct MaxReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + scalar_max_op op; + *accum = op(t, *accum); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + scalar_max_op op; + (*accum) = op.packetOp(*accum, p); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return MinMaxBottomValue::IsInteger>::bottom_value(); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + scalar_max_op op; + return op(saccum, op.predux(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasMax, + IsStateful = false, + IsExactlyAssociative = (NaNPropagation != PropagateFast) + }; +}; + +template +struct MinReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + scalar_min_op op; + *accum = op(t, *accum); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + scalar_min_op op; + (*accum) = op.packetOp(*accum, p); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return MinMaxBottomValue::IsInteger>::bottom_value(); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + scalar_min_op op; + return op(saccum, op.predux(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasMin, + IsStateful = false, + IsExactlyAssociative = (NaNPropagation != PropagateFast) + }; +}; + +template +struct ProdReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + internal::scalar_product_op prod_op; + (*accum) = prod_op(*accum, t); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmul(*accum, p); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(1); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_product_op prod_op; + return prod_op(saccum, predux_mul(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::MulCost, + PacketAccess = PacketType::HasMul, + IsStateful = false, + IsExactlyAssociative = true + }; +}; + +struct AndReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum && t; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { return accum; } +}; + +template +struct reducer_traits { + enum { Cost = 1, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true }; +}; + +struct OrReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum || t; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { return false; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { return accum; } +}; + +template +struct reducer_traits { + enum { Cost = 1, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true }; +}; + +// Argmin/Argmax reducers. Returns the first occurrence if multiple locations +// contain the same min/max value. +template +struct ArgMaxPairReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t.second < accum->second) { + return; + } else if (t.second > accum->second || accum->first > t.first) { + *accum = t; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return T(0, NumTraits::lowest()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { return accum; } +}; + +template +struct reducer_traits, Device> { + enum { Cost = NumTraits::AddCost, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true }; +}; + +template +struct ArgMinPairReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { + if (t.second > accum->second) { + return; + } else if (t.second < accum->second || accum->first > t.first) { + *accum = t; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return T(0, NumTraits::highest()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { return accum; } +}; + +template +struct reducer_traits, Device> { + enum { Cost = NumTraits::AddCost, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true }; +}; + +template +class GaussianGenerator { + public: + static constexpr bool PacketAccess = false; + + EIGEN_DEVICE_FUNC GaussianGenerator(const array& means, const array& std_devs) + : m_means(means) { + EIGEN_UNROLL_LOOP + for (size_t i = 0; i < NumDims; ++i) { + m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2; + } + } + + EIGEN_DEVICE_FUNC T operator()(const array& coordinates) const { + T tmp = T(0); + EIGEN_UNROLL_LOOP + for (size_t i = 0; i < NumDims; ++i) { + T offset = coordinates[i] - m_means[i]; + tmp += offset * offset / m_two_sigmas[i]; + } + return numext::exp(-tmp); + } + + private: + array m_means; + array m_two_sigmas; +}; + +template +struct functor_traits > { + enum { + Cost = NumDims * + (2 * NumTraits::AddCost + NumTraits::MulCost + functor_traits >::Cost) + + functor_traits >::Cost, + PacketAccess = GaussianGenerator::PacketAccess + }; +}; + +template +struct scalar_clamp_op { + EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { + return numext::mini(numext::maxi(x, m_min), m_max); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& x) const { + return internal::pmin(internal::pmax(x, pset1(m_min)), pset1(m_max)); + } + const Scalar m_min; + const Scalar m_max; +}; +template +struct functor_traits > { + enum { + Cost = 2 * NumTraits::AddCost, + PacketAccess = (packet_traits::HasMin && packet_traits::HasMax) + }; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h new file mode 100644 index 00000000000..b038eaf787a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -0,0 +1,271 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorGeneratorOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor generator class. + * + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorGeneratorOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorGeneratorOp type; +}; + +} // end namespace internal + +template +class TensorGeneratorOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator) + : m_xpr(expr), m_generator(generator) {} + + EIGEN_DEVICE_FUNC const Generator& generator() const { return m_generator; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Generator m_generator; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorGeneratorOp XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator::Dimensions Dimensions; + static constexpr int NumDims = internal::array_size::value; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = true, + PreferBlockAccess = true, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef internal::TensorIntDivisor IndexDivisor; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), m_generator(op.generator()) { + TensorEvaluator argImpl(op.expression(), device); + m_dimensions = argImpl.dimensions(); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + EIGEN_UNROLL_LOOP + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); + } + } else { + m_strides[NumDims - 1] = 1; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { return true; } + EIGEN_STRONG_INLINE void cleanup() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + array coords; + extract_coordinates(index, coords); + return m_generator(coords); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + const int packetSize = PacketType::size; + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX std::remove_const_t values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.firstLevelCacheSize(); + // TODO(ezhulenev): Generator should have a cost. + return internal::TensorBlockResourceRequirements::skewed(target_size); + } + + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + static const bool is_col_major = static_cast(Layout) == static_cast(ColMajor); + + // Compute spatial coordinates for the first block element. + array coords; + extract_coordinates(desc.offset(), coords); + array initial_coords = coords; + + // Offset in the output block buffer. + Index offset = 0; + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims; ++i) { + const int dim = is_col_major ? i : NumDims - 1 - i; + it[i].size = desc.dimension(dim); + it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride); + it[i].span = it[i].stride * (it[i].size - 1); + it[i].count = 0; + } + eigen_assert(it[0].stride == 1); + + // Prepare storage for the materialized generator result. + const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + + CoeffReturnType* block_buffer = block_storage.data(); + + static const int packet_size = PacketType::size; + + static const int inner_dim = is_col_major ? 0 : NumDims - 1; + const Index inner_dim_size = it[0].size; + const Index inner_dim_vectorized = inner_dim_size - packet_size; + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + Index i = 0; + // Generate data for the vectorized part of the inner-most dimension. + for (; i <= inner_dim_vectorized; i += packet_size) { + for (Index j = 0; j < packet_size; ++j) { + array j_coords = coords; // Break loop dependence. + j_coords[inner_dim] += j; + *(block_buffer + offset + i + j) = m_generator(j_coords); + } + coords[inner_dim] += packet_size; + } + // Finalize non-vectorized part of the inner-most dimension. + for (; i < inner_dim_size; ++i) { + *(block_buffer + offset + i) = m_generator(coords); + coords[inner_dim]++; + } + coords[inner_dim] = initial_coords[inner_dim]; + + // For the 1d tensor we need to generate only one inner-most dimension. + if (NumDims == 1) break; + + // Update offset. + for (i = 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + offset += it[i].stride; + coords[is_col_major ? i : NumDims - 1 - i]++; + break; + } + if (i != NumDims - 1) it[i].count = 0; + coords[is_col_major ? i : NumDims - 1 - i] = initial_coords[is_col_major ? i : NumDims - 1 - i]; + offset -= it[i].span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + // TODO(rmlarsen): This is just a placeholder. Define interface to make + // generators return their cost. + return TensorOpCost(0, 0, TensorOpCost::AddCost() + TensorOpCost::MulCost()); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_coordinates(Index index, array& coords) const { + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[NumDims - 1] = index; + } + } + + const Device EIGEN_DEVICE_REF m_device; + Dimensions m_dimensions; + array m_strides; + array m_fast_strides; + Generator m_generator; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h new file mode 100644 index 00000000000..6a1240cfacd --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h @@ -0,0 +1,36 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Eugene Brevdo +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given tensors. + * + * This function computes the regularized incomplete beta function (integral). + * + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseTernaryOp, + const ADerived, const BDerived, const XDerived> +betainc(const Eigen::TensorBase& a, + const Eigen::TensorBase& b, + const Eigen::TensorBase& x) { + return TensorCwiseTernaryOp, const ADerived, const BDerived, + const XDerived>(a.derived(), b.derived(), x.derived(), + internal::scalar_betainc_op()); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h new file mode 100644 index 00000000000..3073272a24f --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h @@ -0,0 +1,101 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2018 Deven Desai +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) +#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H + +// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design +// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU, but +// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler +// When compiling such files, gcc will end up trying to pick up the CUDA headers by +// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU) +// This will obviously not work when trying to compile tensorflow on a system with no CUDA +// To work around this issue for HIP systems (and leave the default behaviour intact), the +// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and +// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is +// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well + +#if defined(EIGEN_USE_HIP) + +#define gpuStream_t hipStream_t +#define gpuDeviceProp_t hipDeviceProp_t +#define gpuError_t hipError_t +#define gpuSuccess hipSuccess +#define gpuErrorNotReady hipErrorNotReady +#define gpuGetDeviceCount hipGetDeviceCount +#define gpuGetLastError hipGetLastError +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorName hipGetErrorName +#define gpuGetErrorString hipGetErrorString +#define gpuGetDeviceProperties hipGetDeviceProperties +#define gpuStreamDefault hipStreamDefault +#define gpuGetDevice hipGetDevice +#define gpuSetDevice hipSetDevice +#define gpuMalloc hipMalloc +#define gpuFree hipFree +#define gpuMemsetAsync hipMemsetAsync +#define gpuMemset2DAsync hipMemset2DAsync +#define gpuMemcpyAsync hipMemcpyAsync +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuStreamQuery hipStreamQuery +#define gpuSharedMemConfig hipSharedMemConfig +#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig +#define gpuStreamSynchronize hipStreamSynchronize +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuMemcpy hipMemcpy + +#else + +#define gpuStream_t cudaStream_t +#define gpuDeviceProp_t cudaDeviceProp +#define gpuError_t cudaError_t +#define gpuSuccess cudaSuccess +#define gpuErrorNotReady cudaErrorNotReady +#define gpuGetDeviceCount cudaGetDeviceCount +#define gpuGetLastError cudaGetLastError +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorName cudaGetErrorName +#define gpuGetErrorString cudaGetErrorString +#define gpuGetDeviceProperties cudaGetDeviceProperties +#define gpuStreamDefault cudaStreamDefault +#define gpuGetDevice cudaGetDevice +#define gpuSetDevice cudaSetDevice +#define gpuMalloc cudaMalloc +#define gpuFree cudaFree +#define gpuMemsetAsync cudaMemsetAsync +#define gpuMemset2DAsync cudaMemset2DAsync +#define gpuMemcpyAsync cudaMemcpyAsync +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuStreamQuery cudaStreamQuery +#define gpuSharedMemConfig cudaSharedMemConfig +#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig +#define gpuStreamSynchronize cudaStreamSynchronize +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuMemcpy cudaMemcpy + +#endif + +// gpu_assert can be overridden +#ifndef gpu_assert + +#if defined(EIGEN_HIP_DEVICE_COMPILE) +// HIPCC do not support the use of assert on the GPU side. +#define gpu_assert(COND) +#else +#define gpu_assert(COND) eigen_assert(COND) +#endif + +#endif // gpu_assert + +#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h new file mode 100644 index 00000000000..509bcee0b87 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h @@ -0,0 +1,45 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2018 Deven Desai +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) + +#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + +#undef gpuStream_t +#undef gpuDeviceProp_t +#undef gpuError_t +#undef gpuSuccess +#undef gpuErrorNotReady +#undef gpuGetDeviceCount +#undef gpuGetErrorString +#undef gpuGetDeviceProperties +#undef gpuStreamDefault +#undef gpuGetDevice +#undef gpuSetDevice +#undef gpuMalloc +#undef gpuFree +#undef gpuMemsetAsync +#undef gpuMemset2DAsync +#undef gpuMemcpyAsync +#undef gpuMemcpyDeviceToDevice +#undef gpuMemcpyDeviceToHost +#undef gpuMemcpyHostToDevice +#undef gpuStreamQuery +#undef gpuSharedMemConfig +#undef gpuDeviceSetSharedMemConfig +#undef gpuStreamSynchronize +#undef gpuDeviceSynchronize +#undef gpuMemcpy + +#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + +#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H + +#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h new file mode 100644 index 00000000000..0bdb1ab7985 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -0,0 +1,413 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H +#define EIGEN_CXX11_TENSOR_TENSOR_IO_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +struct TensorIOFormat; + +namespace internal { +template +struct TensorPrinter; +} + +template +struct TensorIOFormatBase { + using Derived = Derived_; + TensorIOFormatBase(const std::vector& separator, const std::vector& prefix, + const std::vector& suffix, int precision = StreamPrecision, int flags = 0, + const std::string& tenPrefix = "", const std::string& tenSuffix = "", const char fill = ' ') + : tenPrefix(tenPrefix), + tenSuffix(tenSuffix), + prefix(prefix), + suffix(suffix), + separator(separator), + fill(fill), + precision(precision), + flags(flags) { + init_spacer(); + } + + void init_spacer() { + if ((flags & DontAlignCols)) return; + spacer.resize(prefix.size()); + spacer[0] = ""; + int i = int(tenPrefix.length()) - 1; + while (i >= 0 && tenPrefix[i] != '\n') { + spacer[0] += ' '; + i--; + } + + for (std::size_t k = 1; k < prefix.size(); k++) { + int j = int(prefix[k].length()) - 1; + while (j >= 0 && prefix[k][j] != '\n') { + spacer[k] += ' '; + j--; + } + } + } + + std::string tenPrefix; + std::string tenSuffix; + std::vector prefix; + std::vector suffix; + std::vector separator; + char fill; + int precision; + int flags; + std::vector spacer{}; +}; + +struct TensorIOFormatNumpy : public TensorIOFormatBase { + using Base = TensorIOFormatBase; + TensorIOFormatNumpy() + : Base(/*separator=*/{" ", "\n"}, /*prefix=*/{"", "["}, /*suffix=*/{"", "]"}, /*precision=*/StreamPrecision, + /*flags=*/0, /*tenPrefix=*/"[", /*tenSuffix=*/"]") {} +}; + +struct TensorIOFormatNative : public TensorIOFormatBase { + using Base = TensorIOFormatBase; + TensorIOFormatNative() + : Base(/*separator=*/{", ", ",\n", "\n"}, /*prefix=*/{"", "{"}, /*suffix=*/{"", "}"}, + /*precision=*/StreamPrecision, /*flags=*/0, /*tenPrefix=*/"{", /*tenSuffix=*/"}") {} +}; + +struct TensorIOFormatPlain : public TensorIOFormatBase { + using Base = TensorIOFormatBase; + TensorIOFormatPlain() + : Base(/*separator=*/{" ", "\n", "\n", ""}, /*prefix=*/{""}, /*suffix=*/{""}, /*precision=*/StreamPrecision, + /*flags=*/0, /*tenPrefix=*/"", /*tenSuffix=*/"") {} +}; + +struct TensorIOFormatLegacy : public TensorIOFormatBase { + using Base = TensorIOFormatBase; + TensorIOFormatLegacy() + : Base(/*separator=*/{", ", "\n"}, /*prefix=*/{"", "["}, /*suffix=*/{"", "]"}, /*precision=*/StreamPrecision, + /*flags=*/0, /*tenPrefix=*/"", /*tenSuffix=*/"") {} +}; + +struct TensorIOFormat : public TensorIOFormatBase { + using Base = TensorIOFormatBase; + TensorIOFormat(const std::vector& separator, const std::vector& prefix, + const std::vector& suffix, int precision = StreamPrecision, int flags = 0, + const std::string& tenPrefix = "", const std::string& tenSuffix = "", const char fill = ' ') + : Base(separator, prefix, suffix, precision, flags, tenPrefix, tenSuffix, fill) {} + + static inline const TensorIOFormatNumpy Numpy() { return TensorIOFormatNumpy{}; } + + static inline const TensorIOFormatPlain Plain() { return TensorIOFormatPlain{}; } + + static inline const TensorIOFormatNative Native() { return TensorIOFormatNative{}; } + + static inline const TensorIOFormatLegacy Legacy() { return TensorIOFormatLegacy{}; } +}; + +template +class TensorWithFormat; +// specialize for Layout=ColMajor, Layout=RowMajor and rank=0. +template +class TensorWithFormat { + public: + TensorWithFormat(const T& tensor, const Format& format) : t_tensor(tensor), t_format(format) {} + + friend std::ostream& operator<<(std::ostream& os, const TensorWithFormat& wf) { + // Evaluate the expression if needed + typedef TensorEvaluator, DefaultDevice> Evaluator; + TensorForcedEvalOp eval = wf.t_tensor.eval(); + Evaluator tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + internal::TensorPrinter::run(os, tensor, wf.t_format); + // Cleanup. + tensor.cleanup(); + return os; + } + + protected: + T t_tensor; + Format t_format; +}; + +template +class TensorWithFormat { + public: + TensorWithFormat(const T& tensor, const Format& format) : t_tensor(tensor), t_format(format) {} + + friend std::ostream& operator<<(std::ostream& os, const TensorWithFormat& wf) { + // Switch to RowMajor storage and print afterwards + typedef typename T::Index IndexType; + std::array shuffle; + std::array id; + std::iota(id.begin(), id.end(), IndexType(0)); + std::copy(id.begin(), id.end(), shuffle.rbegin()); + auto tensor_row_major = wf.t_tensor.swap_layout().shuffle(shuffle); + + // Evaluate the expression if needed + typedef TensorEvaluator, DefaultDevice> Evaluator; + TensorForcedEvalOp eval = tensor_row_major.eval(); + Evaluator tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + internal::TensorPrinter::run(os, tensor, wf.t_format); + // Cleanup. + tensor.cleanup(); + return os; + } + + protected: + T t_tensor; + Format t_format; +}; + +template +class TensorWithFormat { + public: + TensorWithFormat(const T& tensor, const Format& format) : t_tensor(tensor), t_format(format) {} + + friend std::ostream& operator<<(std::ostream& os, const TensorWithFormat& wf) { + // Evaluate the expression if needed + typedef TensorEvaluator, DefaultDevice> Evaluator; + TensorForcedEvalOp eval = wf.t_tensor.eval(); + Evaluator tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + internal::TensorPrinter::run(os, tensor, wf.t_format); + // Cleanup. + tensor.cleanup(); + return os; + } + + protected: + T t_tensor; + Format t_format; +}; + +namespace internal { + +// Default scalar printer. +template +struct ScalarPrinter { + static void run(std::ostream& stream, const Scalar& scalar, const Format&) { stream << scalar; } +}; + +template +struct ScalarPrinter::IsComplex>> { + static void run(std::ostream& stream, const Scalar& scalar, const TensorIOFormatNumpy&) { + stream << numext::real(scalar) << "+" << numext::imag(scalar) << "j"; + } +}; + +template +struct ScalarPrinter::IsComplex>> { + static void run(std::ostream& stream, const Scalar& scalar, const TensorIOFormatNative&) { + stream << "{" << numext::real(scalar) << ", " << numext::imag(scalar) << "}"; + } +}; + +template +struct TensorPrinter { + using Scalar = std::remove_const_t; + + static void run(std::ostream& s, const Tensor& tensor, const Format& fmt) { + typedef typename Tensor::Index IndexType; + + eigen_assert(Tensor::Layout == RowMajor); + typedef std::conditional_t::value || is_same::value || + is_same::value || is_same::value, + int, + std::conditional_t>::value || + is_same>::value || + is_same>::value || + is_same>::value, + std::complex, const Scalar&>> + PrintType; + + const IndexType total_size = array_prod(tensor.dimensions()); + + std::streamsize explicit_precision; + if (fmt.precision == StreamPrecision) { + explicit_precision = 0; + } else if (fmt.precision == FullPrecision) { + if (NumTraits::IsInteger) { + explicit_precision = 0; + } else { + explicit_precision = significant_decimals_impl::run(); + } + } else { + explicit_precision = fmt.precision; + } + + std::streamsize old_precision = 0; + if (explicit_precision) old_precision = s.precision(explicit_precision); + + IndexType width = 0; + bool align_cols = !(fmt.flags & DontAlignCols); + if (align_cols) { + // compute the largest width + for (IndexType i = 0; i < total_size; i++) { + std::stringstream sstr; + sstr.copyfmt(s); + ScalarPrinter::run(sstr, static_cast(tensor.data()[i]), fmt); + width = std::max(width, IndexType(sstr.str().length())); + } + } + s << fmt.tenPrefix; + for (IndexType i = 0; i < total_size; i++) { + std::array is_at_end{}; + std::array is_at_begin{}; + + // is the ith element the end of an coeff (always true), of a row, of a matrix, ...? + for (std::size_t k = 0; k < rank; k++) { + if ((i + 1) % (std::accumulate(tensor.dimensions().rbegin(), tensor.dimensions().rbegin() + k, 1, + std::multiplies())) == + 0) { + is_at_end[k] = true; + } + } + + // is the ith element the begin of an coeff (always true), of a row, of a matrix, ...? + for (std::size_t k = 0; k < rank; k++) { + if (i % (std::accumulate(tensor.dimensions().rbegin(), tensor.dimensions().rbegin() + k, 1, + std::multiplies())) == + 0) { + is_at_begin[k] = true; + } + } + + // do we have a line break? + bool is_at_begin_after_newline = false; + for (std::size_t k = 0; k < rank; k++) { + if (is_at_begin[k]) { + std::size_t separator_index = (k < fmt.separator.size()) ? k : fmt.separator.size() - 1; + if (fmt.separator[separator_index].find('\n') != std::string::npos) { + is_at_begin_after_newline = true; + } + } + } + + bool is_at_end_before_newline = false; + for (std::size_t k = 0; k < rank; k++) { + if (is_at_end[k]) { + std::size_t separator_index = (k < fmt.separator.size()) ? k : fmt.separator.size() - 1; + if (fmt.separator[separator_index].find('\n') != std::string::npos) { + is_at_end_before_newline = true; + } + } + } + + std::stringstream suffix, prefix, separator; + for (std::size_t k = 0; k < rank; k++) { + std::size_t suffix_index = (k < fmt.suffix.size()) ? k : fmt.suffix.size() - 1; + if (is_at_end[k]) { + suffix << fmt.suffix[suffix_index]; + } + } + for (std::size_t k = 0; k < rank; k++) { + std::size_t separator_index = (k < fmt.separator.size()) ? k : fmt.separator.size() - 1; + if (is_at_end[k] && + (!is_at_end_before_newline || fmt.separator[separator_index].find('\n') != std::string::npos)) { + separator << fmt.separator[separator_index]; + } + } + for (std::size_t k = 0; k < rank; k++) { + std::size_t spacer_index = (k < fmt.spacer.size()) ? k : fmt.spacer.size() - 1; + if (i != 0 && is_at_begin_after_newline && (!is_at_begin[k] || k == 0)) { + prefix << fmt.spacer[spacer_index]; + } + } + for (int k = rank - 1; k >= 0; k--) { + std::size_t prefix_index = (static_cast(k) < fmt.prefix.size()) ? k : fmt.prefix.size() - 1; + if (is_at_begin[k]) { + prefix << fmt.prefix[prefix_index]; + } + } + + s << prefix.str(); + // So we don't mess around with formatting, output scalar to a string stream, and adjust the width/fill manually. + std::stringstream sstr; + sstr.copyfmt(s); + ScalarPrinter::run(sstr, static_cast(tensor.data()[i]), fmt); + std::string scalar_str = sstr.str(); + IndexType scalar_width = scalar_str.length(); + if (width && scalar_width < width) { + std::string filler; + for (IndexType j = scalar_width; j < width; ++j) { + filler.push_back(fmt.fill); + } + s << filler; + } + s << scalar_str; + s << suffix.str(); + if (i < total_size - 1) { + s << separator.str(); + } + } + s << fmt.tenSuffix; + if (explicit_precision) s.precision(old_precision); + } +}; + +template +struct TensorPrinter> { + using Format = TensorIOFormatLegacy; + using Scalar = std::remove_const_t; + + static void run(std::ostream& s, const Tensor& tensor, const Format&) { + typedef typename Tensor::Index IndexType; + // backwards compatibility case: print tensor after reshaping to matrix of size dim(0) x + // (dim(1)*dim(2)*...*dim(rank-1)). + const IndexType total_size = internal::array_prod(tensor.dimensions()); + if (total_size > 0) { + const IndexType first_dim = Eigen::internal::array_get<0>(tensor.dimensions()); + Map> matrix(tensor.data(), first_dim, + total_size / first_dim); + s << matrix; + return; + } + } +}; + +template +struct TensorPrinter { + static void run(std::ostream& s, const Tensor& tensor, const Format& fmt) { + using Scalar = std::remove_const_t; + + std::streamsize explicit_precision; + if (fmt.precision == StreamPrecision) { + explicit_precision = 0; + } else if (fmt.precision == FullPrecision) { + if (NumTraits::IsInteger) { + explicit_precision = 0; + } else { + explicit_precision = significant_decimals_impl::run(); + } + } else { + explicit_precision = fmt.precision; + } + + std::streamsize old_precision = 0; + if (explicit_precision) old_precision = s.precision(explicit_precision); + s << fmt.tenPrefix; + ScalarPrinter::run(s, tensor.coeff(0), fmt); + s << fmt.tenSuffix; + if (explicit_precision) s.precision(old_precision); + } +}; + +} // end namespace internal +template +std::ostream& operator<<(std::ostream& s, const TensorBase& t) { + s << t.format(TensorIOFormat::Plain()); + return s; +} +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h new file mode 100644 index 00000000000..291301a8193 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -0,0 +1,590 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorImagePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for image processing. + * This assumes that the input has a least 3 dimensions ordered as follow: + * 1st dimension: channels (of size d) + * 2nd dimension: rows (of size r) + * 3rd dimension: columns (of size c) + * There can be additional dimensions such as time (for video) or batch (for + * bulk processing after the first 3. + * Calling the image patch code with patch_rows and patch_cols is equivalent + * to calling the regular patch extraction code with parameters d, patch_rows, + * patch_cols, and 1 for all the additional dimensions. + */ +namespace internal { + +template +struct traits > : public traits { + typedef std::remove_const_t Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions + 1; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorImagePatchOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorImagePatchOp type; +}; + +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Self& self, const Index num_coeff_to_copy, + const Index dst_index, Scalar* dst_data, + const Index src_index) { + const Impl& impl = self.impl(); + for (Index i = 0; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Self& self, const Index num_coeff_to_copy, + const Index dst_index, Scalar* dst_data, + const Index src_index) { + const Impl& impl = self.impl(); + const Index packet_size = internal::unpacket_traits::size; + const Index vectorized_size = (num_coeff_to_copy / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = impl.template packet(src_index + i); + internal::pstoret(dst_data + dst_index + i, p); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchPaddingOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Index num_coeff_to_pad, const Scalar padding_value, + const Index dst_index, Scalar* dst_data) { + const Index packet_size = internal::unpacket_traits::size; + const Packet padded_packet = internal::pset1(padding_value); + const Index vectorized_size = (num_coeff_to_pad / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + internal::pstoret(dst_data + dst_index + i, padded_packet); + } + for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) { + dst_data[dst_index + i] = padding_value; + } + } +}; + +} // end namespace internal + +template +class TensorImagePatchOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, + DenseIndex patch_cols, DenseIndex row_strides, + DenseIndex col_strides, DenseIndex in_row_strides, + DenseIndex in_col_strides, DenseIndex row_inflate_strides, + DenseIndex col_inflate_strides, PaddingType padding_type, + Scalar padding_value) + : m_xpr(expr), + m_patch_rows(patch_rows), + m_patch_cols(patch_cols), + m_row_strides(row_strides), + m_col_strides(col_strides), + m_in_row_strides(in_row_strides), + m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), + m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), + m_padding_top(0), + m_padding_bottom(0), + m_padding_left(0), + m_padding_right(0), + m_padding_type(padding_type), + m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, + DenseIndex patch_cols, DenseIndex row_strides, + DenseIndex col_strides, DenseIndex in_row_strides, + DenseIndex in_col_strides, DenseIndex row_inflate_strides, + DenseIndex col_inflate_strides, DenseIndex padding_top, + DenseIndex padding_bottom, DenseIndex padding_left, + DenseIndex padding_right, Scalar padding_value) + : m_xpr(expr), + m_patch_rows(patch_rows), + m_patch_cols(patch_cols), + m_row_strides(row_strides), + m_col_strides(col_strides), + m_in_row_strides(in_row_strides), + m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), + m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), + m_padding_top(padding_top), + m_padding_bottom(padding_bottom), + m_padding_left(padding_left), + m_padding_right(padding_right), + m_padding_type(PADDING_VALID), + m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC DenseIndex in_row_strides() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC DenseIndex in_col_strides() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC bool padding_explicit() const { return m_padding_explicit; } + EIGEN_DEVICE_FUNC DenseIndex padding_top() const { return m_padding_top; } + EIGEN_DEVICE_FUNC DenseIndex padding_bottom() const { return m_padding_bottom; } + EIGEN_DEVICE_FUNC DenseIndex padding_left() const { return m_padding_left; } + EIGEN_DEVICE_FUNC DenseIndex padding_right() const { return m_padding_right; } + EIGEN_DEVICE_FUNC PaddingType padding_type() const { return m_padding_type; } + EIGEN_DEVICE_FUNC Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const DenseIndex m_in_row_strides; + const DenseIndex m_in_col_strides; + const DenseIndex m_row_inflate_strides; + const DenseIndex m_col_inflate_strides; + const bool m_padding_explicit; + const DenseIndex m_padding_top; + const DenseIndex m_padding_bottom; + const DenseIndex m_padding_left; + const DenseIndex m_padding_right; + const PaddingType m_padding_type; + const Scalar m_padding_value; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorImagePatchOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + static constexpr int NumDims = NumInputDims + 1; + typedef DSizes Dimensions; + typedef std::remove_const_t Scalar; + typedef TensorEvaluator, Device> Self; + typedef TensorEvaluator Impl; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = true, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), m_impl(op.expression(), device) { + EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); + + m_paddingValue = op.padding_value(); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + // Caches a few variables. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputDepth = input_dims[0]; + m_inputRows = input_dims[1]; + m_inputCols = input_dims[2]; + } else { + m_inputDepth = input_dims[NumInputDims - 1]; + m_inputRows = input_dims[NumInputDims - 2]; + m_inputCols = input_dims[NumInputDims - 3]; + } + + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // Input strides and effective input/patch size + m_in_row_strides = op.in_row_strides(); + m_in_col_strides = op.in_col_strides(); + m_row_inflate_strides = op.row_inflate_strides(); + m_col_inflate_strides = op.col_inflate_strides(); + // The "effective" input rows and input cols are the input rows and cols + // after inflating them with zeros. + // For examples, a 2x3 matrix with row_inflate_strides and + // col_inflate_strides of 2 comes from: + // A B C + // D E F + // + // to a matrix is 3 x 5: + // + // A . B . C + // . . . . . + // D . E . F + + m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; + m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; + m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); + m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); + + if (op.padding_explicit()) { + m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / + static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / + static_cast(m_col_strides)); + m_rowPaddingTop = op.padding_top(); + m_colPaddingLeft = op.padding_left(); + } else { + // Computing padding from the type + switch (op.padding_type()) { + case PADDING_VALID: + m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = + numext::maxi(0, ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2); + m_colPaddingLeft = + numext::maxi(0, ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2); + break; + case PADDING_SAME: + m_outputRows = numext::ceil(m_input_rows_eff / static_cast(m_row_strides)); + m_outputCols = numext::ceil(m_input_cols_eff / static_cast(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; + // The padding size calculation for PADDING_SAME has been updated to + // be consistent with how TensorFlow extracts its paddings. + m_rowPaddingTop = numext::maxi(0, m_rowPaddingTop); + m_colPaddingLeft = numext::maxi(0, m_colPaddingLeft); + break; + default: + eigen_assert(false && "unexpected padding"); + m_outputCols = 0; // silence the uninitialised warning; + m_outputRows = 0; //// silence the uninitialised warning; + } + } + eigen_assert(m_outputRows > 0); + eigen_assert(m_outputCols > 0); + + // Dimensions for result of extraction. + if (static_cast(Layout) == static_cast(ColMajor)) { + // ColMajor + // 0: depth + // 1: patch_rows + // 2: patch_cols + // 3: number of patches + // 4 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_rows(); + m_dimensions[2] = op.patch_cols(); + m_dimensions[3] = m_outputRows * m_outputCols; + for (int i = 4; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i - 1]; + } + } else { + // RowMajor + // NumDims-1: depth + // NumDims-2: patch_rows + // NumDims-3: patch_cols + // NumDims-4: number of patches + // NumDims-5 and beyond: anything else (such as batch). + m_dimensions[NumDims - 1] = input_dims[NumInputDims - 1]; + m_dimensions[NumDims - 2] = op.patch_rows(); + m_dimensions[NumDims - 3] = op.patch_cols(); + m_dimensions[NumDims - 4] = m_outputRows * m_outputCols; + for (int i = NumDims - 5; i >= 0; --i) { + m_dimensions[i] = input_dims[i]; + } + } + + // Strides for moving the patch in various dimensions. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_colStride = m_dimensions[1]; + m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[3]; + } else { + m_colStride = m_dimensions[NumDims - 2]; + m_patchStride = m_colStride * m_dimensions[NumDims - 3] * m_dimensions[NumDims - 1]; + m_otherStride = m_patchStride * m_dimensions[NumDims - 4]; + } + + // Strides for navigating through the input tensor. + m_rowInputStride = m_inputDepth; + m_colInputStride = m_inputDepth * m_inputRows; + m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols; + + // Fast representations of different variables. + m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); + m_fastColStride = internal::TensorIntDivisor(m_colStride); + m_fastInflateRowStride = internal::TensorIntDivisor(m_row_inflate_strides); + m_fastInflateColStride = internal::TensorIntDivisor(m_col_inflate_strides); + m_fastInputColsEff = internal::TensorIntDivisor(m_input_cols_eff); + + // Number of patches in the width dimension. + m_fastOutputRows = internal::TensorIntDivisor(m_outputRows); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[0]); + } else { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims - 1]); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; + + // Other ways to index this element. + const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride; + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + // Calculate col index in the input original tensor. + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = + (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate row index in the original input tensor. + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = + (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + + const Index inputIndex = + depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { + return packetWithPossibleZero(index); + } + + const Index indices[2] = {index, index + PacketSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, + (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; + + const Index patch2DIndex = + (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + // Calculate col indices in the original input tensor. + const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft, + colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0] * m_colStride, + patchOffsets[1] - colOffsets[1] * m_colStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + // Calculate col indices in the original input tensor. + const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop, + rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) { + // no padding + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + const Index inputIndex = + depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.template packet(inputIndex); + } + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // We conservatively estimate the cost for the code path where the computed + // index is inside the original image and + // TensorEvaluator::CoordAccess is false. + const double compute_cost = + 3 * TensorOpCost::DivCost() + 6 * TensorOpCost::MulCost() + 8 * TensorOpCost::MulCost(); + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + + Index m_otherStride; + Index m_patchStride; + Index m_colStride; + Index m_row_strides; + Index m_col_strides; + + Index m_in_row_strides; + Index m_in_col_strides; + Index m_row_inflate_strides; + Index m_col_inflate_strides; + + Index m_input_rows_eff; + Index m_input_cols_eff; + Index m_patch_rows_eff; + Index m_patch_cols_eff; + + internal::TensorIntDivisor m_fastOtherStride; + internal::TensorIntDivisor m_fastPatchStride; + internal::TensorIntDivisor m_fastColStride; + internal::TensorIntDivisor m_fastInflateRowStride; + internal::TensorIntDivisor m_fastInflateColStride; + internal::TensorIntDivisor m_fastInputColsEff; + + Index m_rowInputStride; + Index m_colInputStride; + Index m_patchInputStride; + + Index m_inputDepth; + Index m_inputRows; + Index m_inputCols; + + Index m_outputRows; + Index m_outputCols; + + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + internal::TensorIntDivisor m_fastOutputRows; + internal::TensorIntDivisor m_fastOutputDepth; + + Scalar m_paddingValue; + + const Device EIGEN_DEVICE_REF m_device; + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h new file mode 100644 index 00000000000..69ab6840c82 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -0,0 +1,620 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \internal + * + * \class TensorIndexList + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode a set of Tensor dimensions/indices. + * + * The indices in the list can be known at compile time or at runtime. A mix + * of static and dynamic indices can also be provided if needed. The tensor + * code will attempt to take advantage of the indices that are known at + * compile time to optimize the code it generates. + * + * This functionality requires a c++11 compliant compiler. If your compiler + * is older you need to use arrays of indices instead. + * + * Several examples are provided in the cxx11_tensor_index_list.cpp file. + * + * \sa Tensor + */ + +template +struct type2index { + static constexpr Index value = n; + EIGEN_DEVICE_FUNC constexpr operator Index() const { return n; } + EIGEN_DEVICE_FUNC void set(Index val) { eigen_assert(val == n); } +}; + +// This can be used with IndexPairList to get compile-time constant pairs, +// such as IndexPairList, type2indexpair<3,4>>(). +template +struct type2indexpair { + static constexpr Index first = f; + static constexpr Index second = s; + + constexpr EIGEN_DEVICE_FUNC operator IndexPair() const { return IndexPair(f, s); } + + EIGEN_DEVICE_FUNC void set(const IndexPair& val) { + eigen_assert(val.first == f); + eigen_assert(val.second == s); + } +}; + +template +struct NumTraits> { + typedef Index Real; + enum { IsComplex = 0, RequireInitialization = false, ReadCost = 1, AddCost = 1, MulCost = 1 }; + + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real epsilon() { return 0; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real dummy_precision() { return 0; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real highest() { return n; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real lowest() { return n; } +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC void update_value(T& val, Index new_val) { + val = internal::convert_index(new_val); +} +template +EIGEN_DEVICE_FUNC void update_value(type2index& val, Index new_val) { + val.set(new_val); +} + +template +EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair new_val) { + val = new_val; +} +template +EIGEN_DEVICE_FUNC void update_value(type2indexpair& val, IndexPair new_val) { + val.set(new_val); +} + +template +struct is_compile_time_constant { + static constexpr bool value = false; +}; + +template +struct is_compile_time_constant> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant&> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant&> { + static constexpr bool value = true; +}; + +template +struct is_compile_time_constant> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant&> { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant&> { + static constexpr bool value = true; +}; + +template +struct IndexTuple; + +template +struct IndexTuple { + EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() {} + EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) {} + + static constexpr int count = 1 + sizeof...(O); + T head; + IndexTuple others; + typedef T Head; + typedef IndexTuple Other; +}; + +template +struct IndexTuple { + EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() {} + EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) {} + + constexpr static int count = 1; + T head; + typedef T Head; +}; + +template +struct IndexTupleExtractor; + +template +struct IndexTupleExtractor { + typedef typename IndexTupleExtractor::ValType ValType; + + EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { + return IndexTupleExtractor::get_val(val.others); + } + + EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { + return IndexTupleExtractor::get_val(val.others); + } + template + EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { + IndexTupleExtractor::set_val(val.others, new_val); + } +}; + +template +struct IndexTupleExtractor<0, T, O...> { + typedef T ValType; + + EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { return val.head; } + EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { return val.head; } + template + EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { + val.head = new_val; + } +}; + +template +EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor::ValType& array_get(IndexTuple& tuple) { + return IndexTupleExtractor::get_val(tuple); +} +template +EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor::ValType& array_get( + const IndexTuple& tuple) { + return IndexTupleExtractor::get_val(tuple); +} +template +struct array_size> { + static constexpr size_t value = IndexTuple::count; +}; +template +struct array_size> { + static constexpr size_t value = IndexTuple::count; +}; + +template +struct tuple_coeff { + template + EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index i, const IndexTuple& t) { + // return array_get(t) * (i == Idx) + tuple_coeff::get(i, t) * (i != Idx); + return (i == Idx ? array_get(t) : tuple_coeff::get(i, t)); + } + template + EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple& t, const ValueT& value) { + if (i == Idx) { + update_value(array_get(t), value); + } else { + tuple_coeff::set(i, t, value); + } + } + + template + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple& t) { + return ((i == Idx) && is_compile_time_constant::ValType>::value) || + tuple_coeff::value_known_statically(i, t); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple& t) { + return is_compile_time_constant::ValType>::value && + tuple_coeff::values_up_to_known_statically(t); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple& t) { + return is_compile_time_constant::ValType>::value && + is_compile_time_constant::ValType>::value && + array_get(t) > array_get(t) && + tuple_coeff::values_up_to_statically_known_to_increase(t); + } +}; + +template +struct tuple_coeff<0, ValueT> { + template + EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index /*i*/, const IndexTuple& t) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return array_get<0>(t) /* * (i == 0)*/; + } + template + EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple& t, const ValueT value) { + eigen_assert(i == 0); + update_value(array_get<0>(t), value); + } + template + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple&) { + return is_compile_time_constant::ValType>::value && (i == 0); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple&) { + return is_compile_time_constant::ValType>::value; + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple&) { + return true; + } +}; +} // namespace internal + +template +struct IndexList : internal::IndexTuple { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index operator[](const Index i) const { + return internal::tuple_coeff>::value - 1, + Index>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index get(const Index i) const { + return internal::tuple_coeff>::value - 1, + Index>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value) { + return internal::tuple_coeff>::value - 1, + Index>::set(i, *this, value); + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr std::size_t size() const { return 1 + sizeof...(OtherTypes); }; + + EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple& other) + : internal::IndexTuple(other) {} + EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) + : internal::IndexTuple(first, other...) {} + EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple() {} + + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const { + return internal::tuple_coeff>::value - 1, + Index>::value_known_statically(i, *this); + } + EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const { + return internal::tuple_coeff>::value - 1, + Index>::values_up_to_known_statically(*this); + } + + EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const { + return internal::tuple_coeff>::value - 1, + Index>::values_up_to_statically_known_to_increase(*this); + } +}; + +template +std::ostream& operator<<(std::ostream& os, const IndexList& dims) { + os << "["; + for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) { + if (i > 0) os << ", "; + os << dims[i]; + } + os << "]"; + return os; +} + +template +constexpr IndexList make_index_list(FirstType val1, OtherTypes... other_vals) { + return IndexList(val1, other_vals...); +} + +template +struct IndexPairList : internal::IndexTuple { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair operator[](const Index i) const { + return internal::tuple_coeff>::value - 1, + IndexPair>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const IndexPair value) { + return internal::tuple_coeff>::value - 1, + IndexPair>::set(i, *this, value); + } + + EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple& other) + : internal::IndexTuple(other) {} + EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple() {} + + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const { + return internal::tuple_coeff>::value - 1, + Index>::value_known_statically(i, *this); + } +}; + +namespace internal { + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList& sizes) { + Index result = 1; + EIGEN_UNROLL_LOOP + for (size_t i = 0; i < array_size>::value; ++i) { + result *= sizes[i]; + } + return result; +} + +template +struct array_size> { + static const size_t value = array_size>::value; +}; +template +struct array_size> { + static const size_t value = array_size>::value; +}; + +template +struct array_size> { + static const size_t value = 1 + sizeof...(OtherTypes); +}; +template +struct array_size> { + static const size_t value = 1 + sizeof...(OtherTypes); +}; + +template +EIGEN_DEVICE_FUNC constexpr Index array_get(IndexList& a) { + return IndexTupleExtractor::get_val(a); +} +template +EIGEN_DEVICE_FUNC constexpr Index array_get(const IndexList& a) { + return IndexTupleExtractor::get_val(a); +} + +template +struct index_known_statically_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index) { return false; } +}; + +template +struct index_known_statically_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) { + return IndexList().value_known_statically(i); + } +}; + +template +struct index_known_statically_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) { + return IndexList().value_known_statically(i); + } +}; + +template +struct all_indices_known_statically_impl { + static constexpr bool run() { return false; } +}; + +template +struct all_indices_known_statically_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return IndexList().all_values_known_statically(); + } +}; + +template +struct all_indices_known_statically_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return IndexList().all_values_known_statically(); + } +}; + +template +struct indices_statically_known_to_increase_impl { + EIGEN_DEVICE_FUNC static constexpr bool run() { return false; } +}; + +template +struct indices_statically_known_to_increase_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return Eigen::IndexList().values_statically_known_to_increase(); + } +}; + +template +struct indices_statically_known_to_increase_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return Eigen::IndexList().values_statically_known_to_increase(); + } +}; + +template +struct index_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } +}; + +template +struct index_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) == value); + } +}; + +template +struct index_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) == value); + } +}; + +template +struct index_statically_ne_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } +}; + +template +struct index_statically_ne_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) != value); + } +}; + +template +struct index_statically_ne_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) != value); + } +}; + +template +struct index_statically_gt_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } +}; + +template +struct index_statically_gt_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) > value); + } +}; + +template +struct index_statically_gt_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) > value); + } +}; + +template +struct index_statically_lt_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } +}; + +template +struct index_statically_lt_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) < value); + } +}; + +template +struct index_statically_lt_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) && + (IndexList().get(i) < value); + } +}; + +template +struct index_pair_first_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } +}; + +template +struct index_pair_first_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexPairList().value_known_statically(i) && + (IndexPairList().operator[](i).first == value); + } +}; + +template +struct index_pair_first_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexPairList().value_known_statically(i) && + (IndexPairList().operator[](i).first == value); + } +}; + +template +struct index_pair_second_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; } +}; + +template +struct index_pair_second_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexPairList().value_known_statically(i) && + (IndexPairList().operator[](i).second == value); + } +}; + +template +struct index_pair_second_statically_eq_impl> { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexPairList().value_known_statically(i) && + (IndexPairList().operator[](i).second == value); + } +}; + +} // end namespace internal +} // end namespace Eigen + +namespace Eigen { +namespace internal { +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) { + return index_known_statically_impl::run(i); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() { + return all_indices_known_statically_impl::run(); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() { + return indices_statically_known_to_increase_impl::run(); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) { + return index_statically_eq_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) { + return index_statically_ne_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) { + return index_statically_gt_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) { + return index_statically_lt_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) { + return index_pair_first_statically_eq_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) { + return index_pair_second_statically_eq_impl::run(i, value); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h new file mode 100644 index 00000000000..5b0fca8b66a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -0,0 +1,226 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Ke Yang +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorInflation + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor inflation class. + * + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorInflationOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorInflationOp type; +}; + +} // end namespace internal + +template +class TensorInflationOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides) + : m_xpr(expr), m_strides(strides) {} + + EIGEN_DEVICE_FUNC const Strides& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Strides m_strides; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorInflationOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_strides(op.strides()) { + m_dimensions = m_impl.dimensions(); + // Expand each dimension to the inflated dimension. + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1; + } + + // Remember the strides for fast division. + for (int i = 0; i < NumDims; ++i) { + m_fastStrides[i] = internal::TensorIntDivisor(m_strides[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + } else { // RowMajor + m_outputStrides[NumDims - 1] = 1; + m_inputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + // Computes the input index given the output index. Returns true if the output + // index doesn't fall into a hole. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const { + eigen_assert(index < dimensions().TotalSize()); + *inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (idx != idx / m_fastStrides[i] * m_strides[i]) { + return false; + } + *inputIndex += idx / m_strides[i] * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index != index / m_fastStrides[0] * m_strides[0]) { + return false; + } + *inputIndex += index / m_strides[0]; + return true; + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (idx != idx / m_fastStrides[i] * m_strides[i]) { + return false; + } + *inputIndex += idx / m_strides[i] * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index != index / m_fastStrides[NumDims - 1] * m_strides[NumDims - 1]) { + return false; + } + *inputIndex += index / m_strides[NumDims - 1]; + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + Index inputIndex = 0; + if (getInputIndex(index, &inputIndex)) { + return m_impl.coeff(inputIndex); + } else { + return Scalar(0); + } + } + + // TODO(yangke): optimize this function so that we can detect and produce + // all-zero packets + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (3 * TensorOpCost::DivCost() + 3 * TensorOpCost::MulCost() + + 2 * TensorOpCost::AddCost()); + const double input_size = m_impl.dimensions().TotalSize(); + const double output_size = m_dimensions.TotalSize(); + if (output_size == 0) return TensorOpCost(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + const Strides m_strides; + array, NumDims> m_fastStrides; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h new file mode 100644 index 00000000000..85cf492a001 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -0,0 +1,78 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H +#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H + +#include + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorInitializer + * \ingroup CXX11_Tensor_Module + * + * \brief Helper template to initialize Tensors from std::initializer_lists. + */ +namespace internal { + +template +struct Initializer { + typedef std::initializer_list::InitList> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + for (const auto& v : vals) { + (*indices)[traits::NumDimensions - N] = i++; + Initializer::run(tensor, indices, v); + } + } +}; + +template +struct Initializer { + typedef std::initializer_list::Scalar> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + // There is likely a faster way to do that than iterating. + for (const auto& v : vals) { + (*indices)[traits::NumDimensions - 1] = i++; + tensor.coeffRef(*indices) = v; + } + } +}; + +template +struct Initializer { + typedef typename traits::Scalar InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>*, const InitList& v) { + tensor.coeffRef(0) = v; + } +}; + +template +void initialize_tensor(TensorEvaluator& tensor, + const typename Initializer::NumDimensions>::InitList& vals) { + Eigen::array::Index, traits::NumDimensions> indices; + Initializer::NumDimensions>::run(tensor, &indices, vals); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h new file mode 100644 index 00000000000..fddc64822d0 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -0,0 +1,261 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H +#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \internal + * + * \class TensorIntDiv + * \ingroup CXX11_Tensor_Module + * + * \brief Fast integer division by a constant. + * + * See the paper from Granlund and Montgomery for explanation. + * (at https://doi.org/10.1145/773473.178249) + * + * \sa Tensor + */ + +namespace internal { + +// Note: result is undefined if val == 0 +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t count_leading_zeros(const T val) { +#ifdef EIGEN_GPU_COMPILE_PHASE + return __clz(val); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::clz(val); +#elif EIGEN_COMP_MSVC + unsigned long index; + _BitScanReverse(&index, val); + return 31 - index; +#else + EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); + return __builtin_clz(static_cast(val)); +#endif +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t count_leading_zeros(const T val) { +#ifdef EIGEN_GPU_COMPILE_PHASE + return __clzll(val); +#elif defined(SYCL_DEVICE_ONLY) + return static_cast(cl::sycl::clz(val)); +#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 + unsigned long index; + _BitScanReverse64(&index, val); + return 63 - index; +#elif EIGEN_COMP_MSVC + // MSVC's _BitScanReverse64 is not available for 32bits builds. + unsigned int lo = (unsigned int)(val & 0xffffffff); + unsigned int hi = (unsigned int)((val >> 32) & 0xffffffff); + int n; + if (hi == 0) + n = 32 + count_leading_zeros(lo); + else + n = count_leading_zeros(hi); + return n; +#else + EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); + return __builtin_clzll(static_cast(val)); +#endif +} + +template +struct UnsignedTraits { + typedef std::conditional_t type; +}; + +template +struct DividerTraits { + typedef typename UnsignedTraits::type type; + static constexpr int N = sizeof(T) * 8; +}; + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __umulhi(a, b); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::mul_hi(a, static_cast(b)); +#else + return (static_cast(a) * b) >> 32; +#endif +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __umul64hi(a, b); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::mul_hi(a, static_cast(b)); +#elif EIGEN_COMP_MSVC && (EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64) + return __umulh(a, static_cast(b)); +#elif EIGEN_HAS_BUILTIN_INT128 + __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); + return static_cast(v >> 64); +#else + return (TensorUInt128, uint64_t>(a) * TensorUInt128, uint64_t>(b)).upper(); +#endif +} + +template +struct DividerHelper { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) { + EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE); + return static_cast((static_cast(1) << (N + log_div)) / divider - + (static_cast(1) << N) + 1); + } +}; + +template +struct DividerHelper<64, T> { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { +#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) + return static_cast((static_cast<__uint128_t>(1) << (64 + log_div)) / static_cast<__uint128_t>(divider) - + (static_cast<__uint128_t>(1) << 64) + 1); +#else + const uint64_t shift = 1ULL << log_div; + TensorUInt128 result = + TensorUInt128 >(shift, 0) / TensorUInt128, uint64_t>(divider) - + TensorUInt128, static_val<0> >(1, 0) + TensorUInt128, static_val<1> >(1); + return static_cast(result); +#endif + } +}; + +template +struct TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + multiplier = 0; + shift1 = 0; + shift2 = 0; + } + + // Must have 0 < divider < 2^31. This is relaxed to + // 0 < divider < 2^63 when using 64-bit indices on platforms that support + // the __uint128_t type. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { + const int N = DividerTraits::N; + eigen_assert(static_cast::type>(divider) < NumTraits::highest() / 2); + eigen_assert(divider > 0); + + // fast ln2 + const int leading_zeros = count_leading_zeros(static_cast(divider)); + int log_div = N - leading_zeros; + // if divider is a power of two then log_div is 1 more than it should be. + if ((static_cast::type>(1) << (log_div - 1)) == + static_cast::type>(divider)) + log_div--; + + multiplier = DividerHelper::computeMultiplier(log_div, divider); + shift1 = log_div > 1 ? 1 : log_div; + shift2 = log_div > 1 ? log_div - 1 : 0; + } + + // Must have 0 <= numerator. On platforms that don't support the __uint128_t + // type numerator should also be less than 2^32-1. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { + eigen_assert(static_cast::type>(numerator) < NumTraits::highest() / 2); + // eigen_assert(numerator >= 0); // this is implicitly asserted by the line above + + UnsignedType t1 = muluh(multiplier, numerator); + UnsignedType t = (static_cast(numerator) - t1) >> shift1; + return (t1 + t) >> shift2; + } + + private: + typedef typename DividerTraits::type UnsignedType; + UnsignedType multiplier; + int32_t shift1; + int32_t shift2; +}; + +// Optimized version for signed 32 bit integers. +// Derived from Hacker's Delight. +// Only works for divisors strictly greater than one +template <> +class TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + magic = 0; + shift = 0; + } + // Must have 2 <= divider + EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider) { + eigen_assert(divider >= 2); + calcMagic(divider); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { +#ifdef EIGEN_GPU_COMPILE_PHASE + return (__umulhi(magic, n) >> shift); +#elif defined(SYCL_DEVICE_ONLY) + return (cl::sycl::mul_hi(magic, static_cast(n)) >> shift); +#else + uint64_t v = static_cast(magic) * static_cast(n); + return (static_cast(v >> 32) >> shift); +#endif + } + + private: + // Compute the magic numbers. See Hacker's Delight section 10 for an in + // depth explanation. + EIGEN_DEVICE_FUNC void calcMagic(int32_t d) { + const unsigned two31 = 0x80000000; // 2**31. + unsigned ad = d; + unsigned t = two31 + (ad >> 31); + unsigned anc = t - 1 - t % ad; // Absolute value of nc. + int p = 31; // Init. p. + unsigned q1 = two31 / anc; // Init. q1 = 2**p/|nc|. + unsigned r1 = two31 - q1 * anc; // Init. r1 = rem(2**p, |nc|). + unsigned q2 = two31 / ad; // Init. q2 = 2**p/|d|. + unsigned r2 = two31 - q2 * ad; // Init. r2 = rem(2**p, |d|). + unsigned delta = 0; + do { + p = p + 1; + q1 = 2 * q1; // Update q1 = 2**p/|nc|. + r1 = 2 * r1; // Update r1 = rem(2**p, |nc|). + if (r1 >= anc) { // (Must be an unsigned + q1 = q1 + 1; // comparison here). + r1 = r1 - anc; + } + q2 = 2 * q2; // Update q2 = 2**p/|d|. + r2 = 2 * r2; // Update r2 = rem(2**p, |d|). + if (r2 >= ad) { // (Must be an unsigned + q2 = q2 + 1; // comparison here). + r2 = r2 - ad; + } + delta = ad - r2; + } while (q1 < delta || (q1 == delta && r1 == 0)); + + magic = (unsigned)(q2 + 1); + shift = p - 32; + } + + uint32_t magic; + int32_t shift; +}; + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator/(const T& numerator, const TensorIntDivisor& divisor) { + return divisor.divide(numerator); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h new file mode 100644 index 00000000000..43f4bcb4656 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -0,0 +1,185 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorLayoutSwap + * \ingroup CXX11_Tensor_Module + * + * \brief Swap the layout from col-major to row-major, or row-major + * to col-major, and invert the order of the dimensions. + * + * Beware: the dimensions are reversed by this operation. If you want to + * preserve the ordering of the dimensions, you need to combine this + * operation with a shuffle. + * + * \example: + * Tensor input(2, 4); + * Tensor output = input.swap_layout(); + * eigen_assert(output.dimension(0) == 4); + * eigen_assert(output.dimension(1) == 2); + * + * array shuffle(1, 0); + * output = input.swap_layout().shuffle(shuffle); + * eigen_assert(output.dimension(0) == 2); + * eigen_assert(output.dimension(1) == 4); + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = traits::NumDimensions; + static constexpr int Layout = (traits::Layout == ColMajor) ? RowMajor : ColMajor; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorLayoutSwapOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorLayoutSwapOp type; +}; + +} // end namespace internal + +template +class TensorLayoutSwapOp : public TensorBase, WriteAccessors> { + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorLayoutSwapOp) + protected: + typename XprType::Nested m_xpr; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorLayoutSwapOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + + static constexpr int Layout = + (TensorEvaluator::Layout == static_cast(ColMajor)) ? RowMajor : ColMajor; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = m_impl.dimensions()[NumDims - 1 - i]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_impl.coeff(index); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return m_impl.template packet(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return constCast(m_impl.data()); } + + const TensorEvaluator& impl() const { return m_impl; } + + protected: + TensorEvaluator m_impl; + Dimensions m_dimensions; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorLayoutSwapOp XprType; + + static constexpr int Layout = + (TensorEvaluator::Layout == static_cast(ColMajor)) ? RowMajor : ColMajor; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false // to be implemented + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + return this->m_impl.coeffRef(index); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + this->m_impl.template writePacket(index, x); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h new file mode 100644 index 00000000000..f8bbcfee74a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -0,0 +1,85 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H +#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H + +/** use this macro in sfinae selection in templated functions + * + * template::value , int > = 0 + * > + * void foo(){} + * + * becomes => + * + * template::value ) + * > + * void foo(){} + */ + +#define EIGEN_SFINAE_ENABLE_IF(__condition__) std::enable_if_t<(__condition__), int> = 0 + +// Define a macro to use a reference on the host but a value on the device +#if defined(SYCL_DEVICE_ONLY) +#define EIGEN_DEVICE_REF +#else +#define EIGEN_DEVICE_REF & +#endif + +// Define a macro for catching SYCL exceptions if exceptions are enabled +#define EIGEN_SYCL_TRY_CATCH(X) \ + do { \ + EIGEN_TRY { X; } \ + EIGEN_CATCH(const cl::sycl::exception& e) { \ + EIGEN_THROW_X(std::runtime_error("SYCL exception at " + std::string(__FILE__) + ":" + std::to_string(__LINE__) + \ + "\n" + e.what())); \ + } \ + } while (false) + +// Define a macro if local memory flags are unset or one of them is set +// Setting both flags is the same as unsetting them +#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \ + (defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)) +#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 +#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 +#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) +#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 +#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) +#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 +#endif + +#if EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) +#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + using Base::operator=; \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { \ + Base::operator=(other); \ + return *this; \ + } \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) { \ + Base::operator=(other); \ + return *this; \ + } +#else +#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) +#endif + +/** \internal + * \brief Macro to manually inherit assignment operators. + * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is + * defined. This also inherits template operator=(const OtherDerived&) assignments. With C++11 or later + * this also default-implements the copy-constructor + */ +#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(Derived) \ + EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived) + +#endif diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h new file mode 100644 index 00000000000..191413b7d03 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -0,0 +1,191 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_) + +/** \class TensorMap + * \ingroup CXX11_Tensor_Module + * + * \brief A tensor expression mapping an existing array of data. + * + */ +/// `template class MakePointer_` is added to convert the host pointer to the device pointer. +/// It is added due to the fact that for our device compiler `T*` is not allowed. +/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`. +/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_` is `T*` . +/// Therefore, by adding the default value, we managed to convert the type and it does not break any +/// existing code as its default value is `T*`. +template class MakePointer_> +class TensorMap : public TensorBase > { + public: + typedef TensorMap Self; + typedef TensorBase > Base; +#ifdef EIGEN_USE_SYCL + typedef std::remove_reference_t::type> Nested; +#else + typedef typename Eigen::internal::nested::type Nested; +#endif + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::traits::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename PlainObjectType::Base::CoeffReturnType CoeffReturnType; + + typedef typename MakePointer_::Type PointerType; + typedef typename MakePointer_::ConstType PointerConstType; + + // WARN: PointerType still can be a pointer to const (const Scalar*), for + // example in TensorMap> expression. This type of + // expression should be illegal, but adding this restriction is not possible + // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488). + typedef std::conditional_t::value), + PointerType, // use simple pointer in lvalue expressions + PointerConstType // use const pointer in rvalue expressions + > + StoragePointerType; + + // If TensorMap was constructed over rvalue expression (e.g. const Tensor), + // we should return a reference to const from operator() (and others), even + // if TensorMap itself is not const. + typedef std::conditional_t::value), Scalar&, const Scalar&> StorageRefType; + + static constexpr int Options = Options_; + + static constexpr Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + static constexpr int Layout = PlainObjectType::Layout; + enum { IsAligned = ((int(Options_) & Aligned) == Aligned), CoordAccess = true, RawAccess = true }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, + IndexTypes... otherDimensions) + : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), + YOU_MADE_A_PROGRAMMING_MISTAKE) + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, + const array& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor) + : m_data(tensor.data()), m_dimensions(tensor.dimensions()) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) const { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options & RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()() const { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + return m_data[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, + IndexTypes... otherIndices) const { + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); + if (PlainObjectType::Options & RowMajor) { + const Index index = + m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = + m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options & RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()() { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + return m_data[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(Index index) { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, + IndexTypes... otherIndices) { + static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, + "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); + const std::size_t NumDims = sizeof...(otherIndices) + 2; + if (PlainObjectType::Options & RowMajor) { + const Index index = + m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = + m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } + } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorMap) + + private: + StoragePointerType m_data; + Dimensions m_dimensions; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h new file mode 100644 index 00000000000..8c2bb2ef9e2 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -0,0 +1,292 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H +#define EIGEN_CXX11_TENSOR_TENSOR_META_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +template +struct Cond {}; + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& choose(Cond, const T1& first, const T2&) { + return first; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T2& choose(Cond, const T1&, const T2& second) { + return second; +} + +template +struct max_n_1 { + static const size_t size = n; +}; +template <> +struct max_n_1<0> { + static const size_t size = 1; +}; + +template +EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T divup(const T x, const T y) { + return Eigen::numext::div_ceil(x, y); +} + +// Default packet types +template +struct PacketType : internal::packet_traits { + typedef typename internal::packet_traits::type type; +}; + +// For CUDA packet types when using a GpuDevice +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE) + +typedef ulonglong2 Packet4h2; +template <> +struct PacketType { + typedef Packet4h2 type; + static const int size = 8; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasExpm1 = 0, + HasLog = 1, + HasLog1p = 0, + HasLog10 = 0, + HasPow = 1, + }; +}; +#endif + +#if defined(EIGEN_USE_SYCL) + +namespace TensorSycl { +namespace internal { + +template +struct PlusOp { + static constexpr Index Value = A + B; +}; + +template +struct DivOp { + static constexpr Index Value = A / B; +}; + +template class StepOp> +struct static_for { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) { + op(start); + static_for::Value, end, step, StepOp>::loop(op); + } +}; +template class StepOp> +struct static_for { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {} +}; + +template +struct Vectorise { + static constexpr int PacketSize = 1; + typedef OutScalar PacketReturnType; +}; + +template +struct Vectorise { + static constexpr int PacketSize = Eigen::PacketType::size; + typedef typename Eigen::PacketType::type PacketReturnType; +}; + +static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) { return ((((x) + (y)-1) / (y)) * (y)); } + +} // namespace internal +} // namespace TensorSycl + +template <> +struct PacketType { + typedef half type; + static const int size = 1; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasArg = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0 + }; +}; +template +struct PacketType : internal::default_packet_traits { + typedef Scalar type; + typedef Scalar half; + enum { + Vectorizable = 0, + size = 1, + AlignedOnScalar = 0, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0 + }; +}; + +template +struct PacketType : PacketType {}; + +#ifndef EIGEN_DONT_VECTORIZE_SYCL +#define PACKET_TYPE(CVQual, Type, val, lengths, DEV) \ + template <> \ + struct PacketType : internal::sycl_packet_traits { \ + typedef typename internal::packet_traits::type type; \ + typedef typename internal::packet_traits::half half; \ + }; + +PACKET_TYPE(const, float, 1, 4, SyclDevice) +PACKET_TYPE(, float, 1, 4, SyclDevice) +PACKET_TYPE(const, float, 1, 4, const SyclDevice) +PACKET_TYPE(, float, 1, 4, const SyclDevice) + +PACKET_TYPE(const, double, 0, 2, SyclDevice) +PACKET_TYPE(, double, 0, 2, SyclDevice) +PACKET_TYPE(const, double, 0, 2, const SyclDevice) +PACKET_TYPE(, double, 0, 2, const SyclDevice) +#undef PACKET_TYPE + +template <> +struct PacketType : PacketType {}; +template <> +struct PacketType : PacketType {}; +#endif +#endif + +// Pair mimics std::pair but works on e.g. nvcc. +template +struct Pair { + public: + EIGEN_MAKE_ALIGNED_OPERATOR_NEW + + U first; + V second; + + typedef U first_type; + typedef V second_type; + + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair() : first(), second() {} + + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair(const U& f, const V& s) : first(f), second(s) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(Pair& rhs) { + using numext::swap; + swap(first, rhs.first); + swap(second, rhs.second); + } +}; + +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const Pair& x, const Pair& y) { + return (x.first == y.first && x.second == y.second); +} + +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const Pair& x, const Pair& y) { + return !(x == y); +} + +// Can't use std::pairs on cuda devices +template +struct IndexPair { + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {} + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {} + + EIGEN_DEVICE_FUNC void set(IndexPair val) { + first = val.first; + second = val.second; + } + + Idx first; + Idx second; +}; + +namespace internal { + +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array( + IndexType& idx, numeric_list) { + return {static_cast(idx[First]), static_cast(idx[Is])...}; +} +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array(IndexType&, + numeric_list) { + return array(); +} + +/** Make an array (for index/dimensions) out of a custom index */ +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array(IndexType& idx) { + return customIndices2Array(idx, typename gen_numeric_list::type{}); +} + +template +struct is_base_of { + typedef char (&yes)[1]; + typedef char (&no)[2]; + + template + struct Host { + operator BB*() const; + operator DD*(); + }; + + template + static yes check(D*, T); + static no check(B*, int); + + static const bool value = sizeof(check(Host(), int())) == sizeof(yes); +}; + +} // namespace internal + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h new file mode 100644 index 00000000000..d790f35bb9c --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -0,0 +1,984 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H +#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template +struct traits> : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = array_size::value; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorReshapingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, + typename eval>::type> { + typedef TensorReshapingOp type; +}; + +} // end namespace internal + +template +class TensorReshapingOp : public TensorBase, WriteAccessors> { + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC const NewDimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReshapingOp) + + protected: + typename XprType::Nested m_xpr; + const NewDimensions m_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory, Device> ConstCastStorage; + + static constexpr int NumOutputDims = internal::array_size::value; + static constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + + enum ReshapingKind { + // We do not use layout information to determine reshaping kind. + // Depending on the layout `N` can be inner or outer dimension. + OneByN = 0, // expr.reshape(1, N) + NByOne = 1, // expr.reshape(N, 1) + Runtime = 2 // Reshape dimensions are dynamic (specified at runtime). + }; + + // clang-format off + static const ReshapingKind kind = + (NumOutputDims == 2 && internal::index_statically_eq(/*index=*/0, /*value=*/1)) ? OneByN + : (NumOutputDims == 2 && internal::index_statically_eq(/*index=*/1, /*value=*/1)) ? NByOne + : Runtime; + // clang-format on + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + // For trivial reshapes with raw access to underlying data we will provide + // zero overhead block access. + // TODO(ezhulenev): Consider adding block access without raw access? + BlockAccess = TensorEvaluator::RawAccess && NumInputDims > 0 && NumOutputDims > 0, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.dimensions()) { + // The total size of the reshaped tensor must be equal to the total size + // of the input tensor. + eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(data, std::move(done)); + } +#endif + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_impl.coeff(index); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return m_impl.template packet(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + // required in block(OutputTensorBlock* output_block) const + // For C++03 compatibility this must be defined outside the method + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + eigen_assert(m_impl.data() != NULL); + eigen_assert((kind == Runtime) || (kind == OneByN && desc.dimensions()[0] == 1) || + (kind == NByOne && desc.dimensions()[1] == 1)); + + if (kind == OneByN || kind == NByOne) { + // We can guarantee at compile time that block is just a contiguous slice + // of the underlying expression memory buffer. + return TensorBlock(internal::TensorBlockKind::kView, m_impl.data() + desc.offset(), desc.dimensions()); + } else { + // This will do additional runtime checks, and in the end it might be also + // a view, or it might be a block materialized in the temporary buffer. + return TensorBlock::materialize(m_impl.data(), m_dimensions, desc, scratch); + } + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return constCast(m_impl.data()); } + + EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } + + protected: + TensorEvaluator m_impl; + NewDimensions m_dimensions; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> + +{ + typedef TensorEvaluator, Device> Base; + typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess + }; + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + return this->m_impl.coeffRef(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + this->m_impl.template writePacket(index, x); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) { + eigen_assert(this->m_impl.data() != NULL); + + typedef typename TensorBlock::XprType TensorBlockExpr; + typedef internal::TensorBlockAssignment + TensorBlockAssign; + + TensorBlockAssign::Run(TensorBlockAssign::target(desc.dimensions(), internal::strides(this->dimensions()), + this->m_impl.data(), desc.offset()), + block.expr()); + } +}; + +/** \class TensorSlicing + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor slicing class. + * + * + */ +namespace internal { +template +struct traits> : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = array_size::value; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorSlicingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, + typename eval>::type> { + typedef TensorSlicingOp type; +}; + +} // end namespace internal + +template +class TensorSlicingOp : public TensorBase> { + public: + typedef TensorBase> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, + const Sizes& sizes) + : m_xpr(expr), m_indices(indices), m_sizes(sizes) {} + + EIGEN_DEVICE_FUNC const StartIndices& startIndices() const { return m_indices; } + EIGEN_DEVICE_FUNC const Sizes& sizes() const { return m_sizes; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorSlicingOp) + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_indices; + const Sizes m_sizes; +}; + +namespace internal { + +// Fixme: figure out the exact threshold +template +struct MemcpyTriggerForSlicing { + EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) {} + EIGEN_DEVICE_FUNC bool operator()(Index total, Index contiguous) const { + const bool prefer_block_evaluation = BlockAccess && total > 32 * 1024; + return !prefer_block_evaluation && contiguous > threshold_; + } + + private: + Index threshold_; +}; + +// It is very expensive to start the memcpy kernel on GPU: we therefore only +// use it for large copies. +#ifdef EIGEN_USE_GPU +template +struct MemcpyTriggerForSlicing { + EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) {} + EIGEN_DEVICE_FUNC bool operator()(Index, Index contiguous) const { return contiguous > 4 * 1024 * 1024; } +}; +#endif + +// It is very expensive to start the memcpy kernel on GPU: we therefore only +// use it for large copies. +#ifdef EIGEN_USE_SYCL +template +struct MemcpyTriggerForSlicing { + EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) {} + EIGEN_DEVICE_FUNC bool operator()(Index, Index contiguous) const { return contiguous > 4 * 1024 * 1024; } +}; +#endif + +} // namespace internal + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorSlicingOp XprType; + static constexpr int NumDims = internal::array_size::value; + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + typedef StorageMemory Storage; + typedef StorageMemory, Device> ConstCastStorage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets and sizes. + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess && + // FIXME: Temporary workaround for bug in slicing of bool tensors. + !internal::is_same, bool>::value, + PreferBlockAccess = true, + CoordAccess = false, + RawAccess = false + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + // Tensor slicing does not change the block type. + typedef typename TensorEvaluator::TensorBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { + m_is_identity = true; + for (int i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); + if (m_impl.dimensions()[i] != op.sizes()[i] || op.startIndices()[i] != 0) { + m_is_identity = false; + } + } + + // No strides for scalars. + if (NumDims == 0) return; + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const Sizes& output_dims = op.sizes(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + + // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * output_dims[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); + } + } else { + m_inputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + + // Don't initialize m_fastOutputStrides[NumDims-1] since it won't ever be accessed. + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * output_dims[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (!NumTraits>::RequireInitialization && data && m_impl.data()) { + Index contiguous_values = 1; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } + // Use memcpy if it's going to be faster than using the regular evaluation. + const internal::MemcpyTriggerForSlicing trigger(m_device); + if (trigger(internal::array_prod(dimensions()), contiguous_values)) { + EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); + for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { + Index offset = srcCoeff(i); + m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src + offset), + contiguous_values * sizeof(Scalar)); + } + return false; + } + } + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType /*data*/, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + const int packetSize = PacketType::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < internal::array_prod(dimensions())); + + if (m_is_identity) { + return m_impl.template packet(index); + } + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[NumDims - 1]); + inputIndices[1] += (indices[1] + m_offsets[NumDims - 1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize - 1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP + for (int i = 1; i < packetSize - 1; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + internal::TensorBlockResourceRequirements::skewed(target_size), m_impl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset())); + TensorBlock block = m_impl.block(arg_desc, scratch); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); + return block; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); + if (result) { + Index offset = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i + 1; j < NumDims; ++j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i - 1; j >= 0; --j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } + return result + offset; + } + return NULL; + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[0]); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[NumDims - 1]); + } + return inputIndex; + } + + array m_outputStrides; + array, NumDims> m_fastOutputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + const Device EIGEN_DEVICE_REF m_device; + Dimensions m_dimensions; + bool m_is_identity; + const StartIndices m_offsets; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorSlicingOp XprType; + static constexpr int NumDims = internal::array_size::value; + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + CoordAccess = false, + RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + if (this->m_is_identity) { + this->m_impl.template writePacket(index, x); + return; + } + + const int packetSize = PacketType::size; + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[0]); + inputIndices[1] += (indices[1] + this->m_offsets[0]); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[NumDims - 1]); + inputIndices[1] += (indices[1] + this->m_offsets[NumDims - 1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + this->m_impl.template writePacket(inputIndices[0], x); + } else { + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + internal::pstore(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize - 1]; + EIGEN_UNROLL_LOOP + for (int i = 1; i < packetSize - 1; ++i) { + this->coeffRef(index + i) = values[i]; + } + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) { + TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset())); + this->m_impl.writeBlock(arg_desc, block); + } +}; + +namespace internal { +template +struct traits> : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = array_size::value; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorStridingSlicingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, + typename eval>::type> { + typedef TensorStridingSlicingOp type; +}; + +} // end namespace internal + +template +class TensorStridingSlicingOp + : public TensorBase> { + public: + typedef TensorBase> Base; + typedef typename internal::traits::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp(const XprType& expr, const StartIndices& startIndices, + const StopIndices& stopIndices, const Strides& strides) + : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices), m_strides(strides) {} + + EIGEN_DEVICE_FUNC const StartIndices& startIndices() const { return m_startIndices; } + EIGEN_DEVICE_FUNC const StartIndices& stopIndices() const { return m_stopIndices; } + EIGEN_DEVICE_FUNC const StartIndices& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingSlicingOp) + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_startIndices; + const StopIndices m_stopIndices; + const Strides m_strides; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorStridingSlicingOp XprType; + static constexpr int NumDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef Strides Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets and sizes. + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()) { + // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero + DSizes startIndicesClamped, stopIndicesClamped; + for (ptrdiff_t i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); + if (m_strides[i] > 0) { + startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); + stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); + } else { + /* implies m_strides[i] < 0 by assert */ + startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); + stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); + } + m_startIndices[i] = startIndicesClamped[i]; + } + + typedef typename TensorEvaluator::Dimensions InputDimensions; + const InputDimensions& input_dims = m_impl.dimensions(); + + // compute output tensor shape + m_is_identity = true; + for (int i = 0; i < NumDims; i++) { + Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; + if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) { + m_dimensions[i] = 0; + } else { + m_dimensions[i] = (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 1 : 0); + eigen_assert(m_dimensions[i] >= 0); + } + if (m_strides[i] != 1 || interval != m_impl.dimensions()[i]) { + m_is_identity = false; + } + } + + Strides output_dims = m_dimensions; + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = m_strides[0]; + m_offsets[0] = startIndicesClamped[0]; + Index previousDimProduct = 1; + for (int i = 1; i < NumDims; ++i) { + previousDimProduct *= input_dims[i - 1]; + m_inputStrides[i] = previousDimProduct * m_strides[i]; + m_offsets[i] = startIndicesClamped[i] * previousDimProduct; + } + + // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * output_dims[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); + } + } else { + m_inputStrides[NumDims - 1] = m_strides[NumDims - 1]; + m_offsets[NumDims - 1] = startIndicesClamped[NumDims - 1]; + Index previousDimProduct = 1; + for (int i = NumDims - 2; i >= 0; --i) { + previousDimProduct *= input_dims[i + 1]; + m_inputStrides[i] = previousDimProduct * m_strides[i]; + m_offsets[i] = startIndicesClamped[i] * previousDimProduct; + } + + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * output_dims[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i] + m_offsets[i]; + index -= idx * m_outputStrides[i]; + } + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i] + m_offsets[i]; + index -= idx * m_outputStrides[i]; + } + } + return inputIndex; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { +#ifndef SYCL_DEVICE_ONLY + return numext::maxi(min, numext::mini(max, value)); +#else + return cl::sycl::clamp(value, min, max); +#endif + } + + array m_outputStrides; + array, NumDims> m_fastOutputStrides; + array m_inputStrides; + bool m_is_identity; + TensorEvaluator m_impl; + const Device EIGEN_DEVICE_REF m_device; + DSizes m_startIndices; // clamped startIndices + DSizes m_dimensions; + DSizes m_offsets; // offset in a flattened shape + const Strides m_strides; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorStridingSlicingOp XprType; + static constexpr int NumDims = internal::array_size::value; + static constexpr int Layout = TensorEvaluator::Layout; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = TensorEvaluator::CoordAccess, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Strides Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h new file mode 100644 index 00000000000..0e8f54a8052 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -0,0 +1,620 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorPadding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor padding class. + * At the moment only padding with a constant value is supported. + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorPaddingOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorPaddingOp type; +}; + +} // end namespace internal + +template +class TensorPaddingOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, + const Scalar padding_value) + : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC const PaddingDimensions& padding() const { return m_padding_dims; } + EIGEN_DEVICE_FUNC Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PaddingDimensions m_padding_dims; + const Scalar m_padding_value; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorPaddingOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = true, + CoordAccess = true, + RawAccess = false + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device) { + // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead + // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector + // of 1 element first and then pad. + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Compute dimensions + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] += m_padding[i].first + m_padding[i].second; + } + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + m_outputStrides[NumDims] = m_outputStrides[NumDims - 1] * m_dimensions[NumDims - 1]; + } else { + m_inputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + m_outputStrides[i + 1] = m_outputStrides[i + 2] * m_dimensions[i + 1]; + } + m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (isPaddingAtIndexForDim(idx, i)) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (isPaddingAtIndexForDim(index, 0)) { + return m_paddingValue; + } + inputIndex += (index - m_padding[0].first); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i + 1]; + if (isPaddingAtIndexForDim(idx, i)) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i + 1]; + } + if (isPaddingAtIndexForDim(index, NumDims - 1)) { + return m_paddingValue; + } + inputIndex += (index - m_padding[NumDims - 1].first); + } + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + if (static_cast(Layout) == static_cast(ColMajor)) { + return packetColMajor(index); + } + return packetRowMajor(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + TensorOpCost cost = m_impl.costPerCoeff(vectorized); + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims; ++i) updateCostPerDimension(cost, i, i == 0); + } else { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i >= 0; --i) updateCostPerDimension(cost, i, i == NumDims - 1); + } + return cost; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + internal::TensorBlockResourceRequirements::skewed(target_size), m_impl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + // If one of the dimensions is zero, return empty block view. + if (desc.size() == 0) { + return TensorBlock(internal::TensorBlockKind::kView, NULL, desc.dimensions()); + } + + static const bool IsColMajor = Layout == static_cast(ColMajor); + const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1; + + Index offset = desc.offset(); + + // Compute offsets in the output tensor corresponding to the desc.offset(). + DSizes output_offsets; + for (int i = NumDims - 1; i > 0; --i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + const int stride_dim = IsColMajor ? dim : dim + 1; + output_offsets[dim] = offset / m_outputStrides[stride_dim]; + offset -= output_offsets[dim] * m_outputStrides[stride_dim]; + } + output_offsets[inner_dim_idx] = offset; + + // Offsets in the input corresponding to output offsets. + DSizes input_offsets = output_offsets; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + input_offsets[dim] = input_offsets[dim] - m_padding[dim].first; + } + + // Compute offset in the input buffer (at this point it might be illegal and + // point outside of the input buffer, because we don't check for negative + // offsets, it will be autocorrected in the block iteration loop below). + Index input_offset = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + input_offset += input_offsets[dim] * m_inputStrides[dim]; + } + + // Destination buffer and scratch buffer both indexed from 0 and have the + // same dimensions as the requested block (for destination buffer this + // property is guaranteed by `desc.destination()`). + Index output_offset = 0; + const DSizes output_strides = internal::strides(desc.dimensions()); + + // NOTE(ezhulenev): We initialize bock iteration state for `NumDims - 1` + // dimensions, skipping innermost dimension. In theory it should be possible + // to squeeze matching innermost dimensions, however in practice that did + // not show any improvements in benchmarks. Also in practice first outer + // dimension usually has padding, and will prevent squeezing. + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims - 1; ++i) { + const int dim = IsColMajor ? i + 1 : NumDims - i - 2; + it[i].count = 0; + it[i].size = desc.dimension(dim); + + it[i].input_stride = m_inputStrides[dim]; + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + it[i].output_stride = output_strides[dim]; + it[i].output_span = it[i].output_stride * (it[i].size - 1); + } + + const Index input_inner_dim_size = static_cast(m_impl.dimensions()[inner_dim_idx]); + + // Total output size. + const Index output_size = desc.size(); + + // We will fill inner dimension of this size in the output. It might be + // larger than the inner dimension in the input, so we might have to pad + // before/after we copy values from the input inner dimension. + const Index output_inner_dim_size = desc.dimension(inner_dim_idx); + + // How many values to fill with padding BEFORE reading from the input inner + // dimension. + const Index output_inner_pad_before_size = + input_offsets[inner_dim_idx] < 0 + ? numext::mini(numext::abs(input_offsets[inner_dim_idx]), output_inner_dim_size) + : 0; + + // How many values we can actually copy from the input inner dimension. + const Index output_inner_copy_size = numext::mini( + // Want to copy from input. + (output_inner_dim_size - output_inner_pad_before_size), + // Can copy from input. + numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] + output_inner_pad_before_size), Index(0))); + + eigen_assert(output_inner_copy_size >= 0); + + // How many values to fill with padding AFTER reading from the input inner + // dimension. + const Index output_inner_pad_after_size = + (output_inner_dim_size - output_inner_copy_size - output_inner_pad_before_size); + + // Sanity check, sum of all sizes must be equal to the output size. + eigen_assert(output_inner_dim_size == + (output_inner_pad_before_size + output_inner_copy_size + output_inner_pad_after_size)); + + // Keep track of current coordinates and padding in the output. + DSizes output_coord = output_offsets; + DSizes output_padded; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + } + + typedef internal::StridedLinearBufferCopy LinCopy; + + // Prepare storage for the materialized padding result. + const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + + // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a + // single logical inner dimension. + + // When possible we squeeze writes for the innermost (only if non-padded) + // dimension with the first padded dimension. This allows to reduce the + // number of calls to LinCopy and better utilize vector instructions. + const bool squeeze_writes = NumDims > 1 && + // inner dimension is not padded + (input_inner_dim_size == m_dimensions[inner_dim_idx]) && + // and equal to the block inner dimension + (input_inner_dim_size == output_inner_dim_size); + + const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1; + + // Maximum coordinate on a squeeze dimension that we can write to. + const Index squeeze_max_coord = + squeeze_writes ? numext::mini( + // max non-padded element in the input + static_cast(m_dimensions[squeeze_dim] - m_padding[squeeze_dim].second), + // max element in the output buffer + static_cast(output_offsets[squeeze_dim] + desc.dimension(squeeze_dim))) + : static_cast(0); + + // Iterate copying data from `m_impl.data()` to the output buffer. + for (Index size = 0; size < output_size;) { + // Detect if we are in the padded region (exclude innermost dimension). + bool is_padded = false; + for (int j = 1; j < NumDims; ++j) { + const int dim = IsColMajor ? j : NumDims - j - 1; + is_padded = output_padded[dim]; + if (is_padded) break; + } + + if (is_padded) { + // Fill single innermost dimension with padding value. + size += output_inner_dim_size; + + LinCopy::template Run(typename LinCopy::Dst(output_offset, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_dim_size); + + } else if (squeeze_writes) { + // Squeeze multiple reads from innermost dimensions. + const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim]; + size += output_inner_dim_size * squeeze_num; + + // Copy `squeeze_num` inner dimensions from input to output. + LinCopy::template Run(typename LinCopy::Dst(output_offset, 1, block_storage.data()), + typename LinCopy::Src(input_offset, 1, m_impl.data()), + output_inner_dim_size * squeeze_num); + + // Update iteration state for only `squeeze_num - 1` processed inner + // dimensions, because we have another iteration state update at the end + // of the loop that will update iteration state for the last inner + // processed dimension. + it[0].count += (squeeze_num - 1); + input_offset += it[0].input_stride * (squeeze_num - 1); + output_offset += it[0].output_stride * (squeeze_num - 1); + output_coord[squeeze_dim] += (squeeze_num - 1); + + } else { + // Single read from innermost dimension. + size += output_inner_dim_size; + + { // Fill with padding before copying from input inner dimension. + const Index out = output_offset; + + LinCopy::template Run(typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_pad_before_size); + } + + { // Copy data from input inner dimension. + const Index out = output_offset + output_inner_pad_before_size; + const Index in = input_offset + output_inner_pad_before_size; + + eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL); + + LinCopy::template Run(typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(in, 1, m_impl.data()), + output_inner_copy_size); + } + + { // Fill with padding after copying from input inner dimension. + const Index out = output_offset + output_inner_pad_before_size + output_inner_copy_size; + + LinCopy::template Run(typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_pad_after_size); + } + } + + for (int j = 0; j < NumDims - 1; ++j) { + const int dim = IsColMajor ? j + 1 : NumDims - j - 2; + + if (++it[j].count < it[j].size) { + input_offset += it[j].input_stride; + output_offset += it[j].output_stride; + output_coord[dim] += 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + break; + } + it[j].count = 0; + input_offset -= it[j].input_span; + output_offset -= it[j].output_span; + output_coord[dim] -= it[j].size - 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } + + private: + struct BlockIteratorState { + BlockIteratorState() : count(0), size(0), input_stride(0), input_span(0), output_stride(0), output_span(0) {} + + Index count; + Index size; + Index input_stride; + Index input_span; + Index output_stride; + Index output_span; + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(Index index, int dim_index) const { + return (!internal::index_pair_first_statically_eq(dim_index, 0) && + index < m_padding[dim_index].first) || + (!internal::index_pair_second_statically_eq(dim_index, 0) && + index >= m_dimensions[dim_index] - m_padding[dim_index].second); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(int dim_index) const { + return internal::index_pair_first_statically_eq(dim_index, 0); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(int dim_index) const { + return internal::index_pair_second_statically_eq(dim_index, 0); + } + + void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const { + const double in = static_cast(m_impl.dimensions()[i]); + const double out = in + m_padding[i].first + m_padding[i].second; + if (out == 0) return; + const double reduction = in / out; + cost *= reduction; + if (first) { + cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + reduction * (1 * TensorOpCost::AddCost())); + } else { + cost += TensorOpCost(0, 0, + 2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + reduction * (2 * TensorOpCost::MulCost() + 1 * TensorOpCost::DivCost())); + } + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index firstIdx = index; + const Index lastIdx = index + PacketSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; + const Index lastPaddedRight = m_outputStrides[i + 1]; + + if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || + (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { + // all the coefficient are between the 2 padding zones. + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index lastIdx = index + PacketSize - 1; + const Index firstIdx = index; + const Index lastPaddedLeft = m_padding[0].first; + const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); + const Index lastPaddedRight = m_outputStrides[1]; + + if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || + (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { + // all the coefficient are between the 2 padding zones. + inputIndex += (index - m_padding[0].first); + return m_impl.template packet(inputIndex); + } + // Every other case + return packetWithPossibleZero(initialIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index firstIdx = index; + const Index lastIdx = index + PacketSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i + 1]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i + 1]; + const Index lastPaddedRight = m_outputStrides[i]; + + if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || + (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { + // all the coefficient are between the 2 padding zones. + const Index idx = index / m_outputStrides[i + 1]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i + 1]; + } else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index lastIdx = index + PacketSize - 1; + const Index firstIdx = index; + const Index lastPaddedLeft = m_padding[NumDims - 1].first; + const Index firstPaddedRight = (m_dimensions[NumDims - 1] - m_padding[NumDims - 1].second); + const Index lastPaddedRight = m_outputStrides[NumDims - 1]; + + if (!isLeftPaddingCompileTimeZero(NumDims - 1) && lastIdx < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } else if (!isRightPaddingCompileTimeZero(NumDims - 1) && firstIdx >= firstPaddedRight && + lastIdx < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } else if ((isLeftPaddingCompileTimeZero(NumDims - 1) && isRightPaddingCompileTimeZero(NumDims - 1)) || + (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { + // all the coefficient are between the 2 padding zones. + inputIndex += (index - m_padding[NumDims - 1].first); + return m_impl.template packet(inputIndex); + } + // Every other case + return packetWithPossibleZero(initialIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + PaddingDimensions m_padding; + + Scalar m_paddingValue; + + const Device EIGEN_DEVICE_REF m_device; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h new file mode 100644 index 00000000000..7b33f1fe19c --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -0,0 +1,260 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorPatch + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor patch class. + * + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions + 1; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorPatchOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorPatchOp type; +}; + +} // end namespace internal + +template +class TensorPatchOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims) + : m_xpr(expr), m_patch_dims(patch_dims) {} + + EIGEN_DEVICE_FUNC const PatchDim& patch_dims() const { return m_patch_dims; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PatchDim m_patch_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorPatchOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value + 1; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { + Index num_patches = 1; + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const PatchDim& patch_dims = op.patch_dims(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims - 1; ++i) { + m_dimensions[i] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[NumDims - 1] = num_patches; + + m_inputStrides[0] = 1; + m_patchStrides[0] = 1; + for (int i = 1; i < NumDims - 1; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + m_patchStrides[i] = m_patchStrides[i - 1] * (input_dims[i - 1] - patch_dims[i - 1] + 1); + } + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + m_dimensions[i + 1] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[0] = num_patches; + + m_inputStrides[NumDims - 2] = 1; + m_patchStrides[NumDims - 2] = 1; + for (int i = NumDims - 3; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + m_patchStrides[i] = m_patchStrides[i + 1] * (input_dims[i + 1] - patch_dims[i + 1] + 1); + } + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? NumDims - 1 : 0; + // Find the location of the first element of the patch. + Index patchIndex = index / m_outputStrides[output_stride_index]; + // Find the offset of the element wrt the location of the first element. + Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index]; + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i]; + patchOffset -= offsetIdx * m_outputStrides[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i + 1]; + patchOffset -= offsetIdx * m_outputStrides[i + 1]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } + inputIndex += (patchIndex + patchOffset); + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? NumDims - 1 : 0; + Index indices[2] = {index, index + PacketSize - 1}; + Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index], + indices[1] / m_outputStrides[output_stride_index]}; + Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index], + indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]}; + + Index inputIndices[2] = {0, 0}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i], patchOffsets[1] / m_outputStrides[i]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i + 1], patchOffsets[1] / m_outputStrides[i + 1]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i + 1]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i + 1]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + } + inputIndices[0] += (patchIndices[0] + patchOffsets[0]); + inputIndices[1] += (patchIndices[1] + patchOffsets[1]); + + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } else { + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[PacketSize - 1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize - 1; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (TensorOpCost::DivCost() + TensorOpCost::MulCost() + + 2 * TensorOpCost::AddCost()); + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + array m_patchStrides; + + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h new file mode 100644 index 00000000000..c9c613a1489 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -0,0 +1,309 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// Copyright (C) 2018 Mehdi Goli Codeplay Software Ltd. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H +#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t get_random_seed() { +#if defined(EIGEN_GPU_COMPILE_PHASE) + // We don't support 3d kernels since we currently only use 1 and + // 2d kernels. + gpu_assert(threadIdx.z == 0); + return blockIdx.x * blockDim.x + threadIdx.x + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); +#else + // Rely on Eigen's random implementation. + return random(); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { + // TODO: Unify with the implementation in the non blocking thread pool. + uint64_t current = *state; + // Update the internal state + *state = current * 6364136223846793005ULL + (stream << 1 | 1); + // Generate the random output (using the PCG-XSH-RS scheme) + return static_cast((current ^ (current >> 22)) >> (22 + (current >> 61))); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { + seed = seed ? seed : get_random_seed(); + return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeUniform(uint64_t* state, uint64_t stream) { + unsigned rnd = PCG_XSH_RS_generator(state, stream); + return static_cast(rnd); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half RandomToTypeUniform(uint64_t* state, uint64_t stream) { + // Generate 10 random bits for the mantissa, merge with exponent. + unsigned rnd = PCG_XSH_RS_generator(state, stream); + const uint16_t half_bits = static_cast(rnd & 0x3ffu) | (static_cast(15) << 10); + Eigen::half result = Eigen::numext::bit_cast(half_bits); + // Return the final result + return result - Eigen::half(1.0f); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 RandomToTypeUniform(uint64_t* state, + uint64_t stream) { + // Generate 7 random bits for the mantissa, merge with exponent. + unsigned rnd = PCG_XSH_RS_generator(state, stream); + const uint16_t half_bits = static_cast(rnd & 0x7fu) | (static_cast(127) << 7); + Eigen::bfloat16 result = Eigen::numext::bit_cast(half_bits); + // Return the final result + return result - Eigen::bfloat16(1.0f); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float RandomToTypeUniform(uint64_t* state, uint64_t stream) { + typedef union { + uint32_t raw; + float fp; + } internal; + internal result; + // Generate 23 random bits for the mantissa mantissa + const unsigned rnd = PCG_XSH_RS_generator(state, stream); + result.raw = rnd & 0x7fffffu; + // Set the exponent + result.raw |= (static_cast(127) << 23); + // Return the final result + return result.fp - 1.0f; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double RandomToTypeUniform(uint64_t* state, uint64_t stream) { + typedef union { + uint64_t raw; + double dp; + } internal; + internal result; + result.raw = 0; + // Generate 52 random bits for the mantissa + // First generate the upper 20 bits + unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu; + // The generate the lower 32 bits + unsigned rnd2 = PCG_XSH_RS_generator(state, stream); + result.raw = (static_cast(rnd1) << 32) | rnd2; + // Set the exponent + result.raw |= (static_cast(1023) << 52); + // Return the final result + return result.dp - 1.0; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex RandomToTypeUniform >(uint64_t* state, + uint64_t stream) { + return std::complex(RandomToTypeUniform(state, stream), RandomToTypeUniform(state, stream)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex RandomToTypeUniform >(uint64_t* state, + uint64_t stream) { + return std::complex(RandomToTypeUniform(state, stream), RandomToTypeUniform(state, stream)); +} + +template +class UniformRandomGenerator { + public: + static constexpr bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(uint64_t seed = 0) { + m_state = PCG_XSH_RS_state(seed); +#ifdef EIGEN_USE_SYCL + // In SYCL it is not possible to build PCG_XSH_RS_state in one step. + // Therefore, we need two steps to initializate the m_state. + // IN SYCL, the constructor of the functor is s called on the CPU + // and we get the clock seed here from the CPU. However, This seed is + // the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. + // and only available on the Operator() function (which is called on the GPU). + // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each + // thread but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each + // thread adds the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the + // construction similar to CUDA Therefore, the thread Id injection is not available at this stage. + // However when the operator() is called the thread ID will be available. So inside the operator, + // we add the thrreadID, BlockId,... (which is equivalent of i) + // to the seed and construct the unique m_state per thead similar to cuda. + m_exec_once = false; +#endif + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(const UniformRandomGenerator& other) { + m_state = other.m_state; +#ifdef EIGEN_USE_SYCL + m_exec_once = other.m_exec_once; +#endif + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { +#ifdef EIGEN_USE_SYCL + if (!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side + m_state += (i * 6364136223846793005ULL); + m_exec_once = true; + } +#endif + T result = RandomToTypeUniform(&m_state, i); + return result; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const { + const int packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX T values[packetSize]; +#ifdef EIGEN_USE_SYCL + if (!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + m_state += (i * 6364136223846793005ULL); + m_exec_once = true; + } +#endif + EIGEN_UNROLL_LOOP + for (int j = 0; j < packetSize; ++j) { + values[j] = RandomToTypeUniform(&m_state, i); + } + return internal::pload(values); + } + + private: + mutable uint64_t m_state; +#ifdef EIGEN_USE_SYCL + mutable bool m_exec_once; +#endif +}; + +template +struct functor_traits > { + enum { + // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)). + Cost = 12 * NumTraits::AddCost * ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)), + PacketAccess = UniformRandomGenerator::PacketAccess + }; +}; + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeNormal(uint64_t* state, uint64_t stream) { + // Use the ratio of uniform method to generate numbers following a normal + // distribution. See for example Numerical Recipes chapter 7.3.9 for the + // details. + T u, v, q; + do { + u = RandomToTypeUniform(state, stream); + v = T(1.7156) * (RandomToTypeUniform(state, stream) - T(0.5)); + const T x = u - T(0.449871); + const T y = numext::abs(v) + T(0.386595); + q = x * x + y * (T(0.196) * y - T(0.25472) * x); + } while (q > T(0.27597) && (q > T(0.27846) || v * v > T(-4) * numext::log(u) * u * u)); + + return v / u; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex RandomToTypeNormal >(uint64_t* state, + uint64_t stream) { + return std::complex(RandomToTypeNormal(state, stream), RandomToTypeNormal(state, stream)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex RandomToTypeNormal >(uint64_t* state, + uint64_t stream) { + return std::complex(RandomToTypeNormal(state, stream), RandomToTypeNormal(state, stream)); +} + +template +class NormalRandomGenerator { + public: + static constexpr bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { + m_state = PCG_XSH_RS_state(seed); +#ifdef EIGEN_USE_SYCL + // In SYCL it is not possible to build PCG_XSH_RS_state in one step. + // Therefore, we need two steps to initializate the m_state. + // IN SYCL, the constructor of the functor is s called on the CPU + // and we get the clock seed here from the CPU. However, This seed is + // the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. + // and only available on the Operator() function (which is called on the GPU). + // Therefore, the thread Id injection is not available at this stage. However when the operator() + // is called the thread ID will be available. So inside the operator, + // we add the thrreadID, BlockId,... (which is equivalent of i) + // to the seed and construct the unique m_state per thead similar to cuda. + m_exec_once = false; +#endif + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(const NormalRandomGenerator& other) { + m_state = other.m_state; +#ifdef EIGEN_USE_SYCL + m_exec_once = other.m_exec_once; +#endif + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { +#ifdef EIGEN_USE_SYCL + if (!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + m_state += (i * 6364136223846793005ULL); + m_exec_once = true; + } +#endif + T result = RandomToTypeNormal(&m_state, i); + return result; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const { + const int packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX T values[packetSize]; +#ifdef EIGEN_USE_SYCL + if (!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + m_state += (i * 6364136223846793005ULL); + m_exec_once = true; + } +#endif + EIGEN_UNROLL_LOOP + for (int j = 0; j < packetSize; ++j) { + values[j] = RandomToTypeNormal(&m_state, i); + } + return internal::pload(values); + } + + private: + mutable uint64_t m_state; +#ifdef EIGEN_USE_SYCL + mutable bool m_exec_once; +#endif +}; + +template +struct functor_traits > { + enum { + // On average, we need to generate about 3 random numbers + // 15 mul, 8 add, 1.5 logs + Cost = 3 * functor_traits >::Cost + 15 * NumTraits::AddCost + + 8 * NumTraits::AddCost + 3 * functor_traits >::Cost / 2, + PacketAccess = NormalRandomGenerator::PacketAccess + }; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h new file mode 100644 index 00000000000..2ecbb7c209e --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -0,0 +1,1038 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H + +// clang is incompatible with the CUDA syntax wrt making a kernel a class friend, +// so we'll use a macro to make clang happy. +#ifndef KERNEL_FRIEND +#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__)) +#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 +#else +#define KERNEL_FRIEND friend +#endif +#endif + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorReduction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reduction class. + * + */ + +namespace internal { +template class MakePointer_> +struct traits > : traits { + typedef traits XprTraits; + typedef typename XprTraits::Scalar Scalar; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + static constexpr int NumDimensions = XprTraits::NumDimensions - array_size::value; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; + + template + struct MakePointer { + // Intermediate typedef to workaround MSVC issue. + typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; +}; + +template class MakePointer_> +struct eval, Eigen::Dense> { + typedef const TensorReductionOp& type; +}; + +template class MakePointer_> +struct nested, 1, + typename eval >::type> { + typedef TensorReductionOp type; +}; + +template +struct DimInitializer { + template + EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims, + const array::value>& reduced, + OutputDims* output_dims, ReducedDims* reduced_dims) { + const int NumInputDims = internal::array_size::value; + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (reduced[i]) { + (*reduced_dims)[reduceIndex] = input_dims[i]; + ++reduceIndex; + } else { + (*output_dims)[outputIndex] = input_dims[i]; + ++outputIndex; + } + } + } +}; + +template <> +struct DimInitializer > { + template + EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims, const array&, Sizes<>*, + array* reduced_dims) { + const int NumInputDims = internal::array_size::value; + for (int i = 0; i < NumInputDims; ++i) { + (*reduced_dims)[i] = input_dims[i]; + } + } +}; + +template +struct are_inner_most_dims { + static const bool value = false; +}; +template +struct preserve_inner_most_dims { + static const bool value = false; +}; + +template +struct are_inner_most_dims { + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_eq(0, 0); + static const bool tmp3 = + index_statically_eq(array_size::value - 1, array_size::value - 1); + static const bool value = tmp1 & tmp2 & tmp3; +}; +template +struct are_inner_most_dims { + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_eq(0, NumTensorDims - array_size::value); + static const bool tmp3 = index_statically_eq(array_size::value - 1, NumTensorDims - 1); + static const bool value = tmp1 & tmp2 & tmp3; +}; +template +struct preserve_inner_most_dims { + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_gt(0, 0); + static const bool value = tmp1 & tmp2; +}; +template +struct preserve_inner_most_dims { + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_lt(array_size::value - 1, NumTensorDims - 1); + static const bool value = tmp1 & tmp2; +}; + +template +struct GenericDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, + Op& reducer, typename Self::CoeffReturnType* accum) { + EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + GenericDimReducer::reduce(self, input, reducer, accum); + } + } +}; +template +struct GenericDimReducer<0, Self, Op> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, + Op& reducer, typename Self::CoeffReturnType* accum) { + for (int j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reduce(self.m_impl.coeff(input), accum); + } + } +}; +template +struct GenericDimReducer<-1, Self, Op> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, + typename Self::CoeffReturnType* accum) { + reducer.reduce(self.m_impl.coeff(index), accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce( + const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalize(accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce( + const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer0) { + using Index = typename Self::Index; + constexpr Index packetSize = internal::unpacket_traits::size; + Index start = 0; + typename Self::PacketReturnType paccum0 = reducer0.template initializePacket(); + if (!Self::ReducerTraits::IsStateful && numValuesToReduce >= 4 * packetSize) { + const Index VectorizedSize4 = (numValuesToReduce / (4 * packetSize)) * (4 * packetSize); + typename Self::PacketReturnType paccum1 = reducer0.template initializePacket(); + typename Self::PacketReturnType paccum2 = reducer0.template initializePacket(); + typename Self::PacketReturnType paccum3 = reducer0.template initializePacket(); + const Index offset0 = firstIndex; + const Index offset1 = firstIndex + packetSize; + const Index offset2 = firstIndex + 2 * packetSize; + const Index offset3 = firstIndex + 3 * packetSize; + for (Index j = 0; j < VectorizedSize4; j += 4 * packetSize) { + reducer0.reducePacket(self.m_impl.template packet(offset0 + j), &paccum0); + reducer0.reducePacket(self.m_impl.template packet(offset1 + j), &paccum1); + reducer0.reducePacket(self.m_impl.template packet(offset2 + j), &paccum2); + reducer0.reducePacket(self.m_impl.template packet(offset3 + j), &paccum3); + } + reducer0.reducePacket(paccum1, &paccum0); + reducer0.reducePacket(paccum2, &paccum0); + reducer0.reducePacket(paccum3, &paccum0); + start = VectorizedSize4; + } + if (start <= (numValuesToReduce - packetSize)) { + const Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; + for (Index j = start; j < VectorizedSize; j += packetSize) { + reducer0.reducePacket(self.m_impl.template packet(firstIndex + j), &paccum0); + } + start = VectorizedSize; + } + typename Self::CoeffReturnType accum = reducer0.initialize(); + for (Index j = start; j < numValuesToReduce; ++j) { + reducer0.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer0.finalizeBoth(accum, paccum0); + } +}; + +#if !defined(EIGEN_HIPCC) + +// The following implements tree-based reduction, which improves the accuracy +// of sum and mean reductions, since each of the n inputs only participates in +// O(log n) additions. +template +EIGEN_DEVICE_FUNC inline Index LeafSize() { + return 1024; +} +template <> +EIGEN_DEVICE_FUNC inline Index LeafSize() { + return 200; +} +template <> +EIGEN_DEVICE_FUNC inline Index LeafSize() { + return 128; +} + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce( + const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + const Index kLeafSize = LeafSize(); + typename Self::CoeffReturnType accum = reducer.initialize(); + if (numValuesToReduce > kLeafSize) { + const typename Self::Index half = numValuesToReduce / 2; + // Recursively reduce the two halves. + reducer.reduce(reduce(self, firstIndex, half, reducer), &accum); + reducer.reduce(reduce(self, firstIndex + half, numValuesToReduce - half, reducer), &accum); + return reducer.finalize(accum); + } else { + return InnerMostDimReducer::reduce(self, firstIndex, numValuesToReduce, reducer); + } + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce( + const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + const Index kLeafSize = LeafSize(); + const typename Self::Index packetSize = internal::unpacket_traits::size; + typename Self::CoeffReturnType accum = reducer.initialize(); + if (numValuesToReduce > packetSize * kLeafSize) { + // Make sure the split point is aligned on a packet boundary. + const typename Self::Index split = + packetSize * + numext::div_ceil(firstIndex + numext::div_ceil(numValuesToReduce, typename Self::Index(2)), packetSize); + const typename Self::Index num_left = numext::mini(split - firstIndex, numValuesToReduce); + reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum); + if (num_left < numValuesToReduce) { + reducer.reduce(reduce(self, split, numValuesToReduce - num_left, reducer), &accum); + } + return reducer.finalize(accum); + } else { + return InnerMostDimReducer::reduce(self, firstIndex, numValuesToReduce, reducer); + } + } +}; +#endif + +template +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, + typename Self::PacketReturnType*) { + eigen_assert(false && "should never be called"); + } +}; + +template +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, + Op& reducer, typename Self::PacketReturnType* accum) { + EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + InnerMostDimPreserver::reduce(self, input, reducer, accum); + } + } +}; + +template +struct InnerMostDimPreserver<0, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, + Op& reducer0, typename Self::PacketReturnType* accum0) { + using Index = typename Self::Index; + const Index stride = self.m_reducedStrides[0]; + const Index size = self.m_reducedDims[0]; + if (!Self::ReducerTraits::IsStateful && size >= 16) { + const Index unrolled_size4 = (size / 4) * 4; + typename Self::PacketReturnType accum1 = reducer0.template initializePacket(); + typename Self::PacketReturnType accum2 = reducer0.template initializePacket(); + typename Self::PacketReturnType accum3 = reducer0.template initializePacket(); + for (Index j = 0; j < unrolled_size4; j += 4) { + const Index input0 = firstIndex + j * stride; + reducer0.reducePacket(self.m_impl.template packet(input0), accum0); + const Index input1 = firstIndex + (j + 1) * stride; + reducer0.reducePacket(self.m_impl.template packet(input1), &accum1); + const Index input2 = firstIndex + (j + 2) * stride; + reducer0.reducePacket(self.m_impl.template packet(input2), &accum2); + const Index input3 = firstIndex + (j + 3) * stride; + reducer0.reducePacket(self.m_impl.template packet(input3), &accum3); + } + reducer0.reducePacket(accum1, accum0); + reducer0.reducePacket(accum2, accum0); + reducer0.reducePacket(accum3, accum0); + for (Index j = unrolled_size4; j < size; ++j) { + Index input = firstIndex + j * stride; + reducer0.reducePacket(self.m_impl.template packet(input), accum0); + } + } else { + for (Index j = 0; j < size; ++j) { + Index input = firstIndex + j * stride; + reducer0.reducePacket(self.m_impl.template packet(input), accum0); + } + } + } +}; +template +struct InnerMostDimPreserver<-1, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, + typename Self::PacketReturnType*) { + eigen_assert(false && "should never be called"); + } +}; + +// Default full reducer +template +struct FullReducer { + static constexpr bool HasOptimizedImplementation = false; + + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, + typename Self::EvaluatorPointerType output) { + const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); + *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + } +}; + +#ifdef EIGEN_USE_THREADS +// Multithreaded full reducers +template +struct FullReducerShard { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, + typename Self::Index numValuesToReduce, Op& reducer, + typename Self::CoeffReturnType* output) { + *output = InnerMostDimReducer::reduce(self, firstIndex, numValuesToReduce, reducer); + } +}; + +// Multithreaded full reducer +template +struct FullReducer { + static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful; + static constexpr Index PacketSize = unpacket_traits::size; + + // launch one reducer per thread and accumulate the result. + static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, + typename Self::CoeffReturnType* output) { + typedef typename Self::Index Index; + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + if (num_coeffs == 0) { + *output = reducer.finalize(reducer.initialize()); + return; + } + const TensorOpCost cost = self.m_impl.costPerCoeff(Vectorizable) + + TensorOpCost(0, 0, internal::functor_traits::Cost, Vectorizable, PacketSize); + const Index num_threads = TensorCostModel::numThreads(num_coeffs, cost, device.numThreads()); + if (num_threads == 1) { + *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + return; + } + const Index blocksize = num_coeffs / num_threads; + const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); + + Barrier barrier(internal::convert_index(numblocks)); + MaxSizeVector shards(numblocks, reducer.initialize()); + for (Index i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier(&barrier, &FullReducerShard::run, self, i * blocksize, + blocksize, reducer, &shards[i]); + } + typename Self::CoeffReturnType finalShard; + if (numblocks * blocksize < num_coeffs) { + finalShard = InnerMostDimReducer::reduce(self, numblocks * blocksize, + num_coeffs - numblocks * blocksize, reducer); + } else { + finalShard = reducer.initialize(); + } + barrier.Wait(); + + for (Index i = 0; i < numblocks; ++i) { + reducer.reduce(shards[i], &finalShard); + } + *output = reducer.finalize(finalShard); + } +}; + +#endif + +// Default inner reducer +template +struct InnerReducer { + static constexpr bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, + typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; + +// Default outer reducer +template +struct OuterReducer { + static constexpr bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, + typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; + +#ifdef EIGEN_USE_SYCL +// Default Generic reducer +template +struct GenericReducer { + static constexpr bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, + typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; +#endif + +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, + unsigned int*); + +#if defined(EIGEN_HAS_GPU_FP16) +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat( + R, const S, I_, internal::packet_traits::type*); +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, + internal::packet_traits::type*); +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); + +#endif + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); +#endif + +/** + * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op. + * This allows the reduction to have a different type for the accumulator than the input data type. + * If this is the case, the functor needs to have two reduce method: one for reducing an element of the input + * with the accumulator and the other for reducing two accumulators. + * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for + * some properties of the input. + */ +template +struct ReductionReturnType { +#if defined(EIGEN_USE_SYCL) + typedef std::remove_const_t().initialize())> type; +#else + typedef std::remove_const_t type; +#endif +}; + +} // end namespace internal + +template class MakePointer_> +class TensorReductionOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef std::remove_const_t CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims) + : m_expr(expr), m_dims(dims) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) + : m_expr(expr), m_dims(dims), m_reducer(reducer) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dims& dims() const { return m_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& reducer() const { return m_reducer; } + + protected: + typename XprType::Nested m_expr; + const Dims m_dims; + const Op m_reducer; +}; + +template +struct TensorReductionEvaluatorBase; + +// Eval as rvalue +template class MakePointer_, typename Device> +struct TensorReductionEvaluatorBase, Device> { + typedef internal::reducer_traits ReducerTraits; + typedef Dims ReducedDims; + typedef TensorReductionOp XprType; + typedef typename XprType::Index Index; + typedef ArgType ChildType; + typedef typename TensorEvaluator::Dimensions InputDimensions; + static constexpr int NumInputDims = internal::array_size::value; + static constexpr int NumReducedDims = internal::array_size::value; + static constexpr int NumOutputDims = NumInputDims - NumReducedDims; + typedef std::conditional_t, DSizes > Dimensions; + typedef typename XprType::Scalar Scalar; + typedef TensorReductionEvaluatorBase, Device> Self; + static constexpr bool InputPacketAccess = TensorEvaluator::PacketAccess; + typedef typename internal::ReductionReturnType::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr Index PacketSize = PacketType::size; + + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. + static constexpr int NumPreservedStrides = max_n_1::size; + + // For full reductions +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + static constexpr bool RunningOnGPU = internal::is_same::value; + static constexpr bool RunningOnSycl = false; +#elif defined(EIGEN_USE_SYCL) + static constexpr bool RunningOnSycl = internal::is_same, Eigen::SyclDevice>::value; + static constexpr bool RunningOnGPU = false; +#else + static constexpr bool RunningOnGPU = false; + static constexpr bool RunningOnSycl = false; +#endif + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess, + BlockAccess = false, + PreferBlockAccess = true, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + static constexpr bool ReducingInnerMostDims = internal::are_inner_most_dims::value; + static constexpr bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; + static constexpr bool RunningFullReduction = (NumOutputDims == 0); + + EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) { + EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Build the bitmap indicating if an input dimension is reduced or not. + for (int i = 0; i < NumInputDims; ++i) { + m_reduced[i] = false; + } + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op.dims()[i] >= 0); + eigen_assert(op.dims()[i] < NumInputDims); + m_reduced[op.dims()[i]] = true; + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + internal::DimInitializer::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims); + + // Precompute output strides. + if (NumOutputDims > 0) { + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + } + } else { + m_outputStrides[static_cast(NumOutputDims - 1)] = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + } + } + } + + // Precompute input strides. + if (NumInputDims > 0) { + array input_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i - 1] * input_dims[i - 1]; + } + } else { + input_strides.back() = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + } + + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + m_reducedStrides[reduceIndex] = input_strides[i]; + ++reduceIndex; + } else { + m_preservedStrides[outputIndex] = input_strides[i]; + m_output_to_input_dim_map[outputIndex] = i; + ++outputIndex; + } + } + } + + // Special case for full reductions + if (NumOutputDims == 0) { + m_preservedStrides[0] = internal::array_prod(input_dims); + } + + m_numValuesToReduce = NumOutputDims == 0 ? internal::array_prod(input_dims) + : (static_cast(Layout) == static_cast(ColMajor)) + ? m_preservedStrides[0] + : m_preservedStrides[static_cast(NumOutputDims - 1)]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) { + // Use the FullReducer if possible. + if ((RunningFullReduction && RunningOnSycl) || + (RunningFullReduction && internal::FullReducer::HasOptimizedImplementation && + ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || !RunningOnGPU))) { + bool need_assign = false; + if (!data) { + m_result = static_cast( + m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType)))); + data = m_result; + need_assign = true; + } + Op reducer(m_reducer); + internal::FullReducer::run(*this, reducer, m_device, data); + return need_assign; + } + + // Attempt to use an optimized reduction. + else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) { + bool reducing_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast(Layout) == static_cast(ColMajor)) { + reducing_inner_dims &= m_reduced[i]; + } else { + reducing_inner_dims &= m_reduced[NumInputDims - 1 - i]; + } + } + if (internal::InnerReducer::HasOptimizedImplementation && + (reducing_inner_dims || ReducingInnerMostDims)) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && + num_values_to_reduce > 128) || + (RunningOnSycl)) { + data = static_cast(m_device.get( + (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + m_result = data; + } else { + return true; + } + } + Op reducer(m_reducer); + // For SYCL this if always return false + if (internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, + num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } + } + + bool preserving_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast(Layout) == static_cast(ColMajor)) { + preserving_inner_dims &= m_reduced[NumInputDims - 1 - i]; + } else { + preserving_inner_dims &= m_reduced[i]; + } + } + if (internal::OuterReducer::HasOptimizedImplementation && preserving_inner_dims) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && + num_values_to_reduce > 32) || + (RunningOnSycl)) { + data = static_cast(m_device.get( + (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + m_result = data; + } else { + return true; + } + } + Op reducer(m_reducer); + // For SYCL this if always return false + if (internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, + num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } + } +#if defined(EIGEN_USE_SYCL) + // If there is no Optimised version for SYCL, the reduction expression + // must break into two subexpression and use the SYCL generic Reducer on the device. + if (RunningOnSycl) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + data = static_cast( + m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + m_result = data; + } + Op reducer(m_reducer); + internal::GenericReducer::run(*this, reducer, m_device, data, num_values_to_reduce, + num_coeffs_to_preserve); + return (m_result != NULL); + } +#endif + } + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) { done(evalSubExprsIfNeededCommon(data)); }); + } +#endif + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + return evalSubExprsIfNeededCommon(data); + } + + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + if (m_result) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + if ((RunningFullReduction || RunningOnGPU) && m_result) { + return *(m_result + index); + } + Op reducer(m_reducer); + if (ReducingInnerMostDims || RunningFullReduction) { + const Index num_values_to_reduce = (static_cast(Layout) == static_cast(ColMajor)) + ? m_preservedStrides[0] + : m_preservedStrides[NumPreservedStrides - 1]; + return internal::InnerMostDimReducer::reduce(*this, firstInput(index), num_values_to_reduce, reducer); + } else { + typename Self::CoeffReturnType accum = reducer.initialize(); + internal::GenericDimReducer::reduce(*this, firstInput(index), reducer, &accum); + return reducer.finalize(accum); + } + } + + // TODO(bsteiner): provide a more efficient implementation. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions()))); + + if (RunningOnGPU && m_result) { + return internal::pload(m_result + index); + } + + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + if (ReducingInnerMostDims) { + const Index num_values_to_reduce = (static_cast(Layout) == static_cast(ColMajor)) + ? m_preservedStrides[0] + : m_preservedStrides[NumPreservedStrides - 1]; + const Index firstIndex = firstInput(index); + for (Index i = 0; i < PacketSize; ++i) { + Op reducer(m_reducer); + values[i] = internal::InnerMostDimReducer::reduce(*this, firstIndex + i * num_values_to_reduce, + num_values_to_reduce, reducer); + } + } else if (PreservingInnerMostDims) { + const Index firstIndex = firstInput(index); + const int innermost_dim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : NumOutputDims - 1; + // TBD: extend this the the n innermost dimensions that we preserve. + if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) { + Op reducer(m_reducer); + typename Self::PacketReturnType accum = reducer.template initializePacket(); + internal::InnerMostDimPreserver::reduce(*this, firstIndex, reducer, &accum); + return reducer.finalizePacket(accum); + } else { + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + } + } else { + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + // Must be called after evalSubExprsIfNeeded(). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + if (RunningFullReduction && m_result) { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } else { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const double compute_cost = num_values_to_reduce * internal::functor_traits::Cost; + return m_impl.costPerCoeff(vectorized) * num_values_to_reduce + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC const Device& device() const { return m_device; } + + private: + template + friend struct internal::GenericDimReducer; + template + friend struct internal::InnerMostDimReducer; + template + friend struct internal::InnerMostDimPreserver; + template + friend struct internal::FullReducer; +#ifdef EIGEN_USE_THREADS + template + friend struct internal::FullReducerShard; +#endif +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + template + KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); +#if defined(EIGEN_HAS_GPU_FP16) + template + KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, + internal::packet_traits::type*); + template + KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, + internal::packet_traits::type*); + template + KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); +#endif + template + KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); + + template + KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); +#endif + +#if defined(EIGEN_USE_SYCL) + template + friend class TensorSycl::internal::GenericNondeterministicReducer; + // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer + template + friend struct internal::GenericReducer; +#endif + + template + friend struct internal::InnerReducer; + + struct BlockIteratorState { + Index input_dim; + Index output_size; + Index output_count; + }; + + // Returns the Index in the input tensor of the first value that needs to be + // used to compute the reduction at output index "index". + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + if (ReducingInnerMostDims) { + if (static_cast(Layout) == static_cast(ColMajor)) { + return index * m_preservedStrides[0]; + } else { + return index * m_preservedStrides[NumPreservedStrides - 1]; + } + } + // TBD: optimize the case where we preserve the innermost dimensions. + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (PreservingInnerMostDims) { + eigen_assert(m_preservedStrides[0] == 1); + startInput += index; + } else { + startInput += index * m_preservedStrides[0]; + } + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (PreservingInnerMostDims) { + eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1); + startInput += index; + } else { + startInput += index * m_preservedStrides[NumPreservedStrides - 1]; + } + } + return startInput; + } + + // Bitmap indicating if an input dimension is reduced or not. + array m_reduced; + // Dimensions of the output of the operation. + Dimensions m_dimensions; + // Precomputed strides for the output tensor. + // Avoid zero-sized arrays, since element access fails to compile on GPU. + array m_outputStrides; + array, (std::max)(NumOutputDims, 1)> m_fastOutputStrides; + array m_preservedStrides; + // Map from output to input dimension index. + array m_output_to_input_dim_map; + // How many values go into each reduction + Index m_numValuesToReduce; + + // Subset of strides of the input tensor for the reduced dimensions. + // Indexed by reduced dimensions. + array m_reducedStrides; + // Size of the input dimensions that are reduced. + // Indexed by reduced dimensions. + array m_reducedDims; + + // Evaluator for the input expression. + TensorEvaluator m_impl; + + // Operation to apply for computing the reduction. + Op m_reducer; + + EvaluatorPointerType m_result; + + const Device EIGEN_DEVICE_REF m_device; +}; + +template class MakePointer_, typename Device> +struct TensorEvaluator, Device> + : public TensorReductionEvaluatorBase, Device> { + typedef TensorReductionEvaluatorBase, Device> Base; + EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device) {} +}; + +template class MakePointer_> +struct TensorEvaluator, Eigen::SyclDevice> + : public TensorReductionEvaluatorBase, Eigen::SyclDevice> { + typedef TensorReductionEvaluatorBase, Eigen::SyclDevice> + Base; + EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) + : Base(op, device) {} + // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL + // kernel + // Therefore the coeff function should be overridden by for SYCL kernel + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const { + return *(this->data() + index); + } + // The packet function in the base the recursive method which is not an standard layout and cannot be used in the SYCL + // kernel + // Therefore the packet function should be overridden by for SYCL kernel + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const { + return internal::pload(this->data() + index); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h new file mode 100644 index 00000000000..c5273e9b82d --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h @@ -0,0 +1,958 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) +// Full reducers for GPU, don't vectorize for now + +// Reducer function that enables multiple gpu thread to safely accumulate at the same +// output address. It basically reads the current value of the output variable, and +// attempts to update it with the new value. If in the meantime another gpu thread +// updated the content of the output address it will try again. +template +__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) + if (sizeof(T) == 4) { + unsigned int oldval = *reinterpret_cast(output); + unsigned int newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } else if (sizeof(T) == 8) { + unsigned long long oldval = *reinterpret_cast(output); + unsigned long long newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned long long readback; + while ((readback = atomicCAS(reinterpret_cast(output), oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } else { + gpu_assert(0 && "Wordsize not supported"); + } +#else // EIGEN_CUDA_ARCH >= 300 + EIGEN_UNUSED_VARIABLE(output); + EIGEN_UNUSED_VARIABLE(accum); + EIGEN_UNUSED_VARIABLE(reducer); + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +// We extend atomicExch to support extra data types +template +__device__ inline Type atomicExchCustom(Type* address, Type val) { + return atomicExch(address, val); +} + +template <> +__device__ inline double atomicExchCustom(double* address, double val) { + unsigned long long int* address_as_ull = reinterpret_cast(address); + return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); +} + +#ifdef EIGEN_HAS_GPU_FP16 +template +__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) { + unsigned int oldval = *reinterpret_cast(output); + unsigned int newval = oldval; + reducer.reducePacket(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reducePacket(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } +} +#ifdef EIGEN_GPU_COMPILE_PHASE +// reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations +template +__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) { + half2* houtput = reinterpret_cast(output); + half2* haccum = reinterpret_cast(&accum); + for (int i = 0; i < 4; ++i) { + atomicReduce(houtput + i, *(haccum + i), reducer); + } +} +#endif // EIGEN_GPU_COMPILE_PHASE +#endif // EIGEN_HAS_GPU_FP16 + +template <> +__device__ inline void atomicReduce(float* output, float accum, SumReducer&) { +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) + atomicAdd(output, accum); +#else // EIGEN_CUDA_ARCH >= 300 + EIGEN_UNUSED_VARIABLE(output); + EIGEN_UNUSED_VARIABLE(accum); + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, + CoeffType* output) { + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const Index num_threads = blockDim.x * gridDim.x; + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = val; + } +} + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, + typename Self::CoeffReturnType* output, + unsigned int* semaphore) { +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) + // Initialize the output value + const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; + if (gridDim.x == 1) { + if (first_index == 0) { + *output = reducer.initialize(); + } + } else { + if (threadIdx.x == 0) { + unsigned int block = atomicCAS(semaphore, 0u, 1u); + if (block == 0) { + // We're the first block to run, initialize the output value + atomicExchCustom(output, reducer.initialize()); + __threadfence(); + atomicExch(semaphore, 2u); + } else { + // Wait for the first block to initialize the output value. + // Use atomicCAS here to ensure that the reads aren't cached + unsigned int val; + do { + val = atomicCAS(semaphore, 2u, 2u); + } while (val < 2u); + } + } + } + + __syncthreads(); + + eigen_assert(gridDim.x == 1 || *semaphore >= 2u); + + typename Self::CoeffReturnType accum = reducer.initialize(); + Index max_iter = numext::mini(num_coeffs - first_index, NumPerThread * BlockSize); + for (Index i = 0; i < max_iter; i += BlockSize) { + const Index index = first_index + i; + eigen_assert(index < num_coeffs); + typename Self::CoeffReturnType val = input.m_impl.coeff(index); + reducer.reduce(val, &accum); + } + +#pragma unroll + for (int offset = warpSize / 2; offset > 0; offset /= 2) { +#if defined(EIGEN_HIPCC) + // use std::is_floating_point to determine the type of reduced_val + // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error + // and list the float and int versions of __shfl_down as the candidate functions. + if (std::is_floating_point::value) { + reducer.reduce(__shfl_down(static_cast(accum), offset, warpSize), &accum); + } else { + reducer.reduce(__shfl_down(static_cast(accum), offset, warpSize), &accum); + } +#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + reducer.reduce(__shfl_down(accum, offset, warpSize), &accum); +#else + reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum); +#endif + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(output, accum, reducer); + } + + if (gridDim.x > 1 && threadIdx.x == 0) { + // Let the last block reset the semaphore + atomicInc(semaphore, gridDim.x + 1); +#if defined(EIGEN_HIPCC) + __threadfence_system(); +#endif + } +#else // EIGEN_CUDA_ARCH >= 300 + EIGEN_UNUSED_VARIABLE(reducer); + EIGEN_UNUSED_VARIABLE(input); + EIGEN_UNUSED_VARIABLE(num_coeffs); + EIGEN_UNUSED_VARIABLE(output); + EIGEN_UNUSED_VARIABLE(semaphore); + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +#ifdef EIGEN_HAS_GPU_FP16 +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, + Index num_coeffs, half* scratch) { + eigen_assert(blockDim.x == 1); + eigen_assert(gridDim.x == 1); + typedef packet_traits::type packet_type; + Index packet_remainder = num_coeffs % Index(unpacket_traits::size); + if (packet_remainder != 0) { + half2* h2scratch = reinterpret_cast(scratch); + for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) { + *h2scratch = __halves2half2(input.coeff(i), input.coeff(i + 1)); + h2scratch++; + } + if ((num_coeffs & 1) != 0) { + half lastCoeff = input.coeff(num_coeffs - 1); + *h2scratch = __halves2half2(lastCoeff, reducer.initialize()); + } + } else { + packet_type reduce = reducer.template initializePacket(); + internal::pstoreu(scratch, reduce); + } +} + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self /*input*/, + Index num_coeffs, half* output) { + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const Index num_threads = blockDim.x * gridDim.x; + typedef typename packet_traits::type PacketType; + + const Index num_packets = num_coeffs / Index(unpacket_traits::size); + PacketType* p_output = reinterpret_cast(output); + for (Index i = thread_id; i < num_packets; i += num_threads) { + p_output[i] = reducer.template initializePacket(); + } + Index packet_remainder = num_coeffs % Index(unpacket_traits::size); + if (thread_id < packet_remainder) { + output[num_coeffs - packet_remainder + thread_id] = reducer.initialize(); + } +} + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, + Index num_coeffs, half* output, + half* scratch) { + typedef typename packet_traits::type PacketType; + const int packet_width = unpacket_traits::size; + eigen_assert(NumPerThread % packet_width == 0); + const Index first_index = blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x; + + // Initialize the output value if it wasn't initialized by the ReductionInitKernel + + if (gridDim.x == 1) { + if (first_index == 0) { + int rem = num_coeffs % packet_width; + if (rem != 0) { + half2* p_scratch = reinterpret_cast(scratch); + pstoreu(scratch, reducer.template initializePacket()); + for (int i = 0; i < rem / 2; i++) { + *p_scratch = __halves2half2(input.coeff(num_coeffs - packet_width + 2 * i), + input.coeff(num_coeffs - packet_width + 2 * i + 1)); + p_scratch++; + } + if ((num_coeffs & 1) != 0) { + half last = input.coeff(num_coeffs - 1); + *p_scratch = __halves2half2(last, reducer.initialize()); + } + } else { + PacketType reduce = reducer.template initializePacket(); + pstoreu(scratch, reduce); + } + } + __syncthreads(); + } + + PacketType accum = reducer.template initializePacket(); + const Index max_iter = + numext::mini((num_coeffs - first_index) / packet_width, NumPerThread * BlockSize / packet_width); + for (Index i = 0; i < max_iter; i += BlockSize) { + const Index index = first_index + packet_width * i; + eigen_assert(index + packet_width < num_coeffs); + PacketType val = input.template packet(index); + reducer.reducePacket(val, &accum); + } + +#pragma unroll + for (int offset = warpSize / 2; offset > 0; offset /= 2) { +#if defined(EIGEN_HIPCC) + PacketType r1; + half2* hr = reinterpret_cast(&r1); + half2* hacc = reinterpret_cast(&accum); + for (int i = 0; i < packet_width / 2; i++) { + // FIXME : remove this workaround once we have native half/half2 support for __shfl_down + union { + int i; + half2 h; + } wka_in, wka_out; + wka_in.h = hacc[i]; + wka_out.i = __shfl_down(wka_in.i, offset, warpSize); + hr[i] = wka_out.h; + } + reducer.reducePacket(r1, &accum); +#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + PacketType r1; + half2* hr = reinterpret_cast(&r1); + half2* hacc = reinterpret_cast(&accum); + for (int i = 0; i < packet_width / 2; i++) { + hr[i] = __shfl_down(hacc[i], offset, warpSize); + } + reducer.reducePacket(r1, &accum); +#else + PacketType r1; + half2* hr = reinterpret_cast(&r1); + half2* hacc = reinterpret_cast(&accum); + for (int i = 0; i < packet_width / 2; i++) { + hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize); + } + reducer.reducePacket(r1, &accum); + +#endif + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(reinterpret_cast(scratch), accum, reducer); + } + + __syncthreads(); + half2* rv1 = reinterpret_cast(scratch); + if (packet_width > 2) { + reducer.reducePacket(rv1[2], rv1); + reducer.reducePacket(rv1[3], rv1 + 1); + reducer.reducePacket(rv1[1], rv1); + } + if (gridDim.x == 1) { + if (first_index == 0) { + half tmp = __low2half(*rv1); + reducer.reduce(__high2half(*rv1), &tmp); + *output = tmp; + } + } +} + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) { + eigen_assert(threadIdx.x == 1); + typedef packet_traits::type packet_type; + if (unpacket_traits::size == 1) { + *output = *scratch; + } else { + half2* pscratch = reinterpret_cast(scratch); + half tmp = __float2half(0.f); + for (int i = 0; i < unpacket_traits::size; i += 2) { + reducer.reduce(__low2half(*pscratch), &tmp); + reducer.reduce(__high2half(*pscratch), &tmp); + pscratch++; + } + *output = tmp; + } +} + +#endif // EIGEN_HAS_GPU_FP16 + +template +struct FullReductionLauncher { + static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) { + gpu_assert(false && "Should only be called on doubles, floats and half floats"); + } +}; + +// Specialization for float and double +template +struct FullReductionLauncher< + Self, Op, OutputType, PacketAccess, + std::enable_if_t::value || internal::is_same::value, + void>> { + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, + typename Self::Index num_coeffs) { + typedef typename Self::Index Index; + const int block_size = 256; + const int num_per_thread = 128; + const int num_blocks = numext::div_ceil(num_coeffs, block_size * num_per_thread); + + unsigned int* semaphore = NULL; + if (num_blocks > 1) { + semaphore = device.semaphore(); + } + + LAUNCH_GPU_KERNEL((FullReductionKernel), num_blocks, block_size, 0, + device, reducer, self, num_coeffs, output, semaphore); + } +}; + +#ifdef EIGEN_HAS_GPU_FP16 +template +struct FullReductionLauncher { + static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) { + gpu_assert(false && "Should not be called since there is no packet accessor"); + } +}; + +template +struct FullReductionLauncher { + static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, + typename Self::Index num_coeffs) { + typedef typename Self::Index Index; + + const int block_size = 256; + const int num_per_thread = 128; + const int num_blocks = numext::div_ceil(num_coeffs, block_size * num_per_thread); + half* scratch = static_cast(device.scratchpad()); + + if (num_blocks > 1) { + // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there + // won't be a race conditions between multiple thread blocks. + LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat), 1, 1, 0, device, reducer, self, + num_coeffs, scratch); + } + + LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat), num_blocks, + block_size, 0, device, reducer, self, num_coeffs, output, scratch); + + if (num_blocks > 1) { + LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat), 1, 1, 0, device, reducer, output, scratch); + } + } +}; +#endif // EIGEN_HAS_GPU_FP16 + +template +struct FullReducer { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple cases + // of doubles, floats and half floats +#ifdef EIGEN_HAS_GPU_FP16 + static constexpr bool HasOptimizedImplementation = + !Self::ReducerTraits::IsStateful && (internal::is_same::value || + internal::is_same::value || + (internal::is_same::value && + reducer_traits::PacketAccess)); +#else // EIGEN_HAS_GPU_FP16 + static constexpr bool HasOptimizedImplementation = + !Self::ReducerTraits::IsStateful && (internal::is_same::value || + internal::is_same::value); +#endif // EIGEN_HAS_GPU_FP16 + + template + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. + if (num_coeffs == 0) { + return; + } + + FullReductionLauncher::PacketAccess>::run(self, reducer, device, + output, num_coeffs); + } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, + Index num_coeffs_to_reduce, + Index num_preserved_coeffs, + typename Self::CoeffReturnType* output) { +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) + typedef typename Self::CoeffReturnType Type; + eigen_assert(blockDim.y == 1); + eigen_assert(blockDim.z == 1); + eigen_assert(gridDim.y == 1); + eigen_assert(gridDim.z == 1); + + const int unroll_times = 16; + eigen_assert(NumPerThread % unroll_times == 0); + + const Index input_col_blocks = numext::div_ceil(num_coeffs_to_reduce, blockDim.x * NumPerThread); + const Index num_input_blocks = input_col_blocks * num_preserved_coeffs; + + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } + __syncthreads(); + } + + for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { + const Index row = i / input_col_blocks; + + if (row < num_preserved_coeffs) { + const Index col_block = i % input_col_blocks; + const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x; + + Type reduced_val = reducer.initialize(); + + for (Index j = 0; j < NumPerThread; j += unroll_times) { + const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1); + if (last_col >= num_coeffs_to_reduce) { + for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) { + const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + reducer.reduce(val, &reduced_val); + } + break; + } else { + // Faster version of the loop with no branches after unrolling. +#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + blockDim.x * (j + k); + reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val); + } + } + } + +#pragma unroll + for (int offset = warpSize / 2; offset > 0; offset /= 2) { +#if defined(EIGEN_HIPCC) + // use std::is_floating_point to determine the type of reduced_val + // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error + // and list the float and int versions of __shfl_down as the candidate functions. + if (std::is_floating_point::value) { + reducer.reduce(__shfl_down(static_cast(reduced_val), offset), &reduced_val); + } else { + reducer.reduce(__shfl_down(static_cast(reduced_val), offset), &reduced_val); + } +#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); +#else + reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val); +#endif + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(&(output[row]), reduced_val, reducer); + } + } + } +#else // EIGEN_CUDA_ARCH >= 300 + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +#ifdef EIGEN_HAS_GPU_FP16 + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, + Index num_coeffs_to_reduce, + Index num_preserved_coeffs, half* output) { + eigen_assert(blockDim.y == 1); + eigen_assert(blockDim.z == 1); + eigen_assert(gridDim.y == 1); + eigen_assert(gridDim.z == 1); + + typedef typename packet_traits::type PacketType; + const int packet_width = unpacket_traits::size; + const int unroll_times = 16 / packet_width; + eigen_assert(NumPerThread % unroll_times == 0); + eigen_assert(unroll_times % 2 == 0); + + const Index input_col_blocks = numext::div_ceil(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2); + const Index num_input_blocks = numext::div_ceil(input_col_blocks * num_preserved_coeffs, 2); + + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + Index i = packet_width * thread_id; + for (; i + packet_width <= num_preserved_coeffs; i += packet_width * num_threads) { + PacketType* poutput = reinterpret_cast(output + i); + *poutput = reducer.template initializePacket(); + } + if (i < num_preserved_coeffs) { + output[i] = reducer.initialize(); + } + __syncthreads(); + } + + for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { + const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows + + if (row + 1 < num_preserved_coeffs) { + const Index col_block = i % input_col_blocks; + const Index col_begin = packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x); + + PacketType reduced_val1 = reducer.template initializePacket(); + PacketType reduced_val2 = reducer.template initializePacket(); + + for (Index j = 0; j < NumPerThread; j += unroll_times) { + const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * packet_width; + if (last_col >= num_coeffs_to_reduce) { + Index col = col_begin + blockDim.x * j; + for (; col + packet_width <= num_coeffs_to_reduce; col += blockDim.x) { + const PacketType val1 = input.m_impl.template packet(row * num_coeffs_to_reduce + col); + reducer.reducePacket(val1, &reduced_val1); + const PacketType val2 = input.m_impl.template packet((row + 1) * num_coeffs_to_reduce + col); + reducer.reducePacket(val2, &reduced_val2); + } + if (col < num_coeffs_to_reduce) { + PacketType r1 = reducer.template initializePacket(); + PacketType r2 = reducer.template initializePacket(); + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + while (col + 1 < num_coeffs_to_reduce) { + *hr1 = __halves2half2(input.m_impl.coeff(row * num_coeffs_to_reduce + col), + input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1)); + *hr2 = __halves2half2(input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col), + input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + 1)); + hr1++; + hr2++; + col += 2; + } + if (col < num_coeffs_to_reduce) { + // Peel; + const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + *hr1 = __halves2half2(last1, reducer.initialize()); + const half last2 = input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col); + *hr2 = __halves2half2(last2, reducer.initialize()); + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); + } + break; + } else { + // Faster version of the loop with no branches after unrolling. +#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + blockDim.x * (j + k) * packet_width; + reducer.reducePacket(input.m_impl.template packet(row * num_coeffs_to_reduce + col), + &reduced_val1); + reducer.reducePacket(input.m_impl.template packet((row + 1) * num_coeffs_to_reduce + col), + &reduced_val2); + } + } + } + +#pragma unroll + for (int offset = warpSize / 2; offset > 0; offset /= 2) { +#if defined(EIGEN_HIPCC) + PacketType r1; + PacketType r2; + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + half2* rv1 = reinterpret_cast(&reduced_val1); + half2* rv2 = reinterpret_cast(&reduced_val2); + for (int i = 0; i < packet_width / 2; i++) { + // FIXME : remove this workaround once we have native half/half2 support for __shfl_down + union { + int i; + half2 h; + } wka_in1, wka_out1; + wka_in1.h = rv1[i]; + wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize); + hr1[i] = wka_out1.h; + + union { + int i; + half2 h; + } wka_in2, wka_out2; + wka_in2.h = rv2[i]; + wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize); + hr2[i] = wka_out2.h; + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); +#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + PacketType r1; + PacketType r2; + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + half2* rv1 = reinterpret_cast(&reduced_val1); + half2* rv2 = reinterpret_cast(&reduced_val2); + for (int i = 0; i < packet_width / 2; i++) { + hr1[i] = __shfl_down(rv1[i], offset, warpSize); + hr2[i] = __shfl_down(rv2[i], offset, warpSize); + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); +#else + PacketType r1; + PacketType r2; + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + half2* rr1 = reinterpret_cast(&reduced_val1); + half2* rr2 = reinterpret_cast(&reduced_val2); + for (int j = 0; j < packet_width / 2; j++) { + hr1[j] = __shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize); + hr2[j] = __shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize); + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); + +#endif + } + half2* rv1 = reinterpret_cast(&reduced_val1); + half2* rv2 = reinterpret_cast(&reduced_val2); + half2 val; + if (packet_width > 2) { + reducer.reducePacket(rv1[2], rv1); + reducer.reducePacket(rv1[3], rv1 + 1); + reducer.reducePacket(rv1[1], rv1); + reducer.reducePacket(rv2[2], rv2); + reducer.reducePacket(rv2[3], rv2 + 1); + reducer.reducePacket(rv2[1], rv2); + } + half val1 = __low2half(*rv1); + reducer.reduce(__high2half(*rv1), &val1); + half val2 = __low2half(*rv2); + reducer.reduce(__high2half(*rv2), &val2); + val = __halves2half2(val1, val2); + if ((threadIdx.x & (warpSize - 1)) == 0) { + half* loc = output + row; + atomicReduce(reinterpret_cast(loc), val, reducer); + } + } + } +} + +#endif // EIGEN_HAS_GPU_FP16 + +template +struct InnerReductionLauncher { + static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, + typename Self::Index) { + gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device"); + return true; + } +}; + +// Specialization for float and double +template +struct InnerReductionLauncher< + Self, Op, OutputType, PacketAccess, + std::enable_if_t::value || internal::is_same::value, + void>> { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, + typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = 256; + const int num_per_thread = 128; + const int dyn_blocks = numext::div_ceil(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be a race conditions between multiple thread blocks. + const int dyn_blocks2 = numext::div_ceil(num_preserved_vals, 1024); + const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024; + const int num_blocks2 = numext::mini(max_blocks2, dyn_blocks2); + LAUNCH_GPU_KERNEL((ReductionInitKernel), num_blocks2, 1024, 0, device, reducer.initialize(), + num_preserved_vals, output); + } + + LAUNCH_GPU_KERNEL((InnerReductionKernel), num_blocks, block_size, 0, device, + reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; + +#ifdef EIGEN_HAS_GPU_FP16 +template +struct InnerReductionLauncher { + static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) { + gpu_assert(false && "Should not be called since there is no packet accessor"); + return true; + } +}; + +template +struct InnerReductionLauncher { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, + typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + if (num_preserved_vals % 2 != 0) { + // Not supported yet, revert to the slower code path + return true; + } + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = /*256*/ 128; + const int num_per_thread = /*128*/ 64; + const int dyn_blocks = numext::div_ceil(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be a race conditions between multiple thread blocks. + LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat), 1, 1, 0, device, reducer, self, + num_preserved_vals, output); + } + + LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat), num_blocks, block_size, 0, + device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; +#endif // EIGEN_HAS_GPU_FP16 + +template +struct InnerReducer { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats and half floats. +#ifdef EIGEN_HAS_GPU_FP16 + static constexpr bool HasOptimizedImplementation = + !Self::ReducerTraits::IsStateful && (internal::is_same::value || + internal::is_same::value || + (internal::is_same::value && + reducer_traits::PacketAccess)); +#else // EIGEN_HAS_GPU_FP16 + static constexpr bool HasOptimizedImplementation = + !Self::ReducerTraits::IsStateful && (internal::is_same::value || + internal::is_same::value); +#endif // EIGEN_HAS_GPU_FP16 + + template + static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, + typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. + if (num_coeffs == 0) { + return true; + } + // It's faster to use the usual code. + if (num_coeffs_to_reduce <= 128) { + return true; + } + + return InnerReductionLauncher::PacketAccess>::run( + self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); + } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, + Index num_coeffs_to_reduce, + Index num_preserved_coeffs, + typename Self::CoeffReturnType* output) { + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } + __syncthreads(); + } + + // Do the reduction. + const Index max_iter = num_preserved_coeffs * numext::div_ceil(num_coeffs_to_reduce, NumPerThread); + for (Index i = thread_id; i < max_iter; i += num_threads) { + const Index input_col = i % num_preserved_coeffs; + const Index input_row = (i / num_preserved_coeffs) * NumPerThread; + typename Self::CoeffReturnType reduced_val = reducer.initialize(); + const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce); + for (Index j = input_row; j < max_row; j++) { + typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col); + reducer.reduce(val, &reduced_val); + } + atomicReduce(&(output[input_col]), reduced_val, reducer); + } +} + +template +struct OuterReducer { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats. + static constexpr bool HasOptimizedImplementation = + !Self::ReducerTraits::IsStateful && (internal::is_same::value || + internal::is_same::value); + template + static +#if !defined(EIGEN_HIPCC) + // FIXME : leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error + // (in the cxx11_tensor_reduction_gpu test) + // + // terminate called after throwing an instance of 'std::runtime_error' + // what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL... + // + // don't know why this happens (and why is it a runtime error instead of a compile time error) + // + // this will be fixed by HIP PR#457 + EIGEN_DEVICE_FUNC +#endif + bool + run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device"); + return true; + } + + static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, + typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + // It's faster to use the usual code. + if (num_coeffs_to_reduce <= 32) { + return true; + } + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = 256; + const int num_per_thread = 16; + const int dyn_blocks = numext::div_ceil(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs in the reduction kernel itself when we don't have to worry + // about race conditions between multiple thread blocks. + const int dyn_blocks2 = numext::div_ceil(num_preserved_vals, 1024); + const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024; + const int num_blocks2 = numext::mini(max_blocks2, dyn_blocks2); + LAUNCH_GPU_KERNEL((ReductionInitKernel), num_blocks2, 1024, 0, device, reducer.initialize(), + num_preserved_vals, output); + } + + LAUNCH_GPU_KERNEL((OuterReductionKernel), num_blocks, block_size, 0, device, + reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; + +#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h new file mode 100644 index 00000000000..6944c033708 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -0,0 +1,588 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorReductionSycl.h + * + * \brief: + * This is the specialization of the reduction operation. Two phase reduction approach + * is used since the GPU does not have Global Synchronization for global memory among + * different work-group/thread block. To solve the problem, we need to create two kernels + * to reduce the data, where the first kernel reduce the data locally and each local + * workgroup/thread-block save the input data into global memory. In the second phase (global reduction) + * one work-group uses one work-group/thread-block to reduces the intermediate data into one single element. + * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU: + * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf + * + *****************************************************************/ + +#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP +#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace TensorSycl { +namespace internal { + +template +struct OpDefiner { + typedef typename Vectorise::PacketReturnType PacketReturnType; + typedef Op type; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, + const Index &) { + return accumulator; + } +}; + +template +struct OpDefiner, CoeffReturnType, Index, false> { + typedef Eigen::internal::SumReducer type; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer &) { + return type(); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator, + const Index &scale) { + ::Eigen::internal::scalar_quotient_op quotient_op; + return quotient_op(accumulator, CoeffReturnType(scale)); + } +}; + +template +struct OpDefiner, CoeffReturnType, Index, true> { + typedef typename Vectorise::PacketReturnType PacketReturnType; + typedef Eigen::internal::SumReducer type; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer &) { + return type(); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, + const Index &scale) { + return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1(CoeffReturnType(scale))); + } +}; + +template +struct SecondStepFullReducer { + typedef cl::sycl::accessor + LocalAccessor; + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + LocalAccessor scratch; + InputAccessor aI; + OutputAccessor outAcc; + Op op; + SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_) + : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {} + + void operator()(cl::sycl::nd_item<1> itemID) const { + // Our empirical research shows that the best performance will be achieved + // when there is only one element per thread to reduce in the second step. + // in this step the second step reduction time is almost negligible. + // Hence, in the second step of reduction the input size is fixed to the + // local size, thus, there is only one element read per thread. The + // algorithm must be changed if the number of reduce per thread in the + // second step is greater than 1. Otherwise, the result will be wrong. + const Index localid = itemID.get_local_id(0); + auto aInPtr = aI + localid; + auto aOutPtr = outAcc; + CoeffReturnType *scratchptr = scratch.get_pointer(); + CoeffReturnType accumulator = *aInPtr; + + scratchptr[localid] = op.finalize(accumulator); + for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.reduce(scratchptr[localid + offset], &accumulator); + scratchptr[localid] = op.finalize(accumulator); + } + } + if (localid == 0) *aOutPtr = op.finalize(accumulator); + } +}; + +// Full reduction first phase. In this version the vectorization is true and the reduction accept +// any generic reducerOp e.g( max, min, sum, mean, iamax, iamin, etc ). +template +class FullReductionKernelFunctor { + public: + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::Index Index; + typedef OpDefiner + OpDef; + + typedef typename OpDef::type Op; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::PacketReturnType PacketReturnType; + typedef std::conditional_t<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess), PacketReturnType, + CoeffReturnType> + OutType; + typedef cl::sycl::accessor + LocalAccessor; + LocalAccessor scratch; + Evaluator evaluator; + EvaluatorPointerType final_output; + Index rng; + Op op; + + FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_, + Index rng_, OpType op_) + : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {} + + void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t compute_reduction( + const cl::sycl::nd_item<1> &itemID) const { + auto output_ptr = final_output; + Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize; + Index globalid = itemID.get_global_id(0); + Index localid = itemID.get_local_id(0); + Index step = Evaluator::PacketSize * itemID.get_global_range(0); + Index start = Evaluator::PacketSize * globalid; + // vectorizable parts + PacketReturnType packetAccumulator = op.template initializePacket(); + for (Index i = start; i < VectorizedRange; i += step) { + op.template reducePacket(evaluator.impl().template packet(i), &packetAccumulator); + } + globalid += VectorizedRange; + // non vectorizable parts + for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) { + op.template reducePacket( + ::Eigen::TensorSycl::internal::PacketWrapper::convert_to_packet_type( + evaluator.impl().coeff(i), op.initialize()), + &packetAccumulator); + } + scratch[localid] = packetAccumulator = + OpDef::finalise_op(op.template finalizePacket(packetAccumulator), rng); + // reduction parts // Local size is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = local_range / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.template reducePacket(scratch[localid + offset], &packetAccumulator); + scratch[localid] = op.template finalizePacket(packetAccumulator); + } + } + if (localid == 0) { + output_ptr[itemID.get_group(0)] = + op.finalizeBoth(op.initialize(), op.template finalizePacket(packetAccumulator)); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t compute_reduction( + const cl::sycl::nd_item<1> &itemID) const { + auto output_ptr = final_output; + Index globalid = itemID.get_global_id(0); + Index localid = itemID.get_local_id(0); + // vectorizable parts + CoeffReturnType accumulator = op.initialize(); + // non vectorizable parts + for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) { + op.reduce(evaluator.impl().coeff(i), &accumulator); + } + scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng); + + // reduction parts. the local size is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = local_range / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.reduce(scratch[localid + offset], &accumulator); + scratch[localid] = op.finalize(accumulator); + } + } + if (localid == 0) { + output_ptr[itemID.get_group(0)] = op.finalize(accumulator); + } + } +}; + +template +class GenericNondeterministicReducer { + public: + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::Index Index; + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + template + GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_, + Index range_, Index num_values_to_reduce_) + : evaluator(evaluator_), + output_accessor(output_accessor_), + functor(OpDef::get_op(functor_)), + range(range_), + num_values_to_reduce(num_values_to_reduce_) {} + + void operator()(cl::sycl::nd_item<1> itemID) const { + // This is to bypass the statefull condition in Eigen meanReducer + Op non_const_functor; + std::memcpy(&non_const_functor, &functor, sizeof(Op)); + auto output_accessor_ptr = output_accessor; + Index globalid = static_cast(itemID.get_global_linear_id()); + if (globalid < range) { + CoeffReturnType accum = functor.initialize(); + Eigen::internal::GenericDimReducer::reduce( + evaluator, evaluator.firstInput(globalid), non_const_functor, &accum); + output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce); + } + } + + private: + Evaluator evaluator; + EvaluatorPointerType output_accessor; + Op functor; + Index range; + Index num_values_to_reduce; +}; + +enum class reduction_dim { inner_most, outer_most }; +// default is preserver +template +struct PartialReductionKernel { + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::Index Index; + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + typedef cl::sycl::accessor + ScratchAcc; + ScratchAcc scratch; + Evaluator evaluator; + EvaluatorPointerType output_accessor; + Op op; + const Index preserve_elements_num_groups; + const Index reduce_elements_num_groups; + const Index num_coeffs_to_preserve; + const Index num_coeffs_to_reduce; + + PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_, + const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_, + const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_) + : scratch(scratch_), + evaluator(evaluator_), + output_accessor(output_accessor_), + op(OpDef::get_op(op_)), + preserve_elements_num_groups(preserve_elements_num_groups_), + reduce_elements_num_groups(reduce_elements_num_groups_), + num_coeffs_to_preserve(num_coeffs_to_preserve_), + num_coeffs_to_reduce(num_coeffs_to_reduce_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId, + CoeffReturnType &accumulator) const { + if (globalPId >= num_coeffs_to_preserve) { + return; + } + Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve) + : globalRId + (globalPId * num_coeffs_to_reduce); + Index localOffset = globalRId; + + const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups; + const Index per_thread_global_stride = + rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride; + for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) { + op.reduce(evaluator.impl().coeff(global_offset), &accumulator); + localOffset += per_thread_local_stride; + global_offset += per_thread_global_stride; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { + const Index linearLocalThreadId = itemID.get_local_id(0); + Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP + : linearLocalThreadId / PannelParameters::LocalThreadSizeR; + Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP + : linearLocalThreadId % PannelParameters::LocalThreadSizeR; + const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups + : itemID.get_group(0) / reduce_elements_num_groups; + const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups + : itemID.get_group(0) % reduce_elements_num_groups; + + Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId; + const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId; + CoeffReturnType *scratchPtr = scratch.get_pointer(); + auto outPtr = output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0); + CoeffReturnType accumulator = op.initialize(); + + element_wise_reduce(globalRId, globalPId, accumulator); + + accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce); + scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] = + accumulator; + if (rt == reduction_dim::inner_most) { + pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP; + rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP; + globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId; + } + + /* Apply the reduction operation between the current local + * id and the one on the other half of the vector. */ + auto out_scratch_ptr = + scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC))); + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (rt == reduction_dim::inner_most) { + accumulator = *out_scratch_ptr; + } + // The Local LocalThreadSizeR is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) { + if (rLocalThreadId < offset) { + op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator); + // The result has already been divided for mean reducer in the + // previous reduction so no need to divide furthermore + *out_scratch_ptr = op.finalize(accumulator); + } + /* All threads collectively read from global memory into local. + * The barrier ensures all threads' IO is resolved before + * execution continues (strictly speaking, all threads within + * a single work-group - there is no co-ordination between + * work-groups, only work-items). */ + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + + if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) { + outPtr[globalPId] = op.finalize(accumulator); + } + } +}; + +template +struct SecondStepPartialReduction { + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + typedef cl::sycl::accessor + ScratchAccessor; + InputAccessor input_accessor; + OutputAccessor output_accessor; + Op op; + const Index num_coeffs_to_preserve; + const Index num_coeffs_to_reduce; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_, + OutputAccessor output_accessor_, OpType op_, + const Index num_coeffs_to_preserve_, + const Index num_coeffs_to_reduce_) + : input_accessor(input_accessor_), + output_accessor(output_accessor_), + op(OpDef::get_op(op_)), + num_coeffs_to_preserve(num_coeffs_to_preserve_), + num_coeffs_to_reduce(num_coeffs_to_reduce_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { + const Index globalId = itemID.get_global_id(0); + + if (globalId >= num_coeffs_to_preserve) return; + + auto in_ptr = input_accessor + globalId; + + OutScalar accumulator = op.initialize(); + // num_coeffs_to_reduce is not bigger that 256 + for (Index i = 0; i < num_coeffs_to_reduce; i++) { + op.reduce(*in_ptr, &accumulator); + in_ptr += num_coeffs_to_preserve; + } + output_accessor[globalId] = op.finalize(accumulator); + } +}; // namespace internal + +template +struct ReductionPannel { + static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP; + static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR; + static EIGEN_CONSTEXPR bool BC = BC_; +}; + +template +struct PartialReducerLauncher { + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::Storage Storage; + typedef typename Self::Index Index; + typedef ReductionPannel + PannelParameters; + + typedef PartialReductionKernel SyclReducerKerneType; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output, + Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) { + Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP); + + // getPowerOfTwo makes sure local range is power of 2 and <= + // maxSyclThreadPerBlock this will help us to avoid extra check on the + // kernel + static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) & + (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + + EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR; + // In this step, we force the code not to be more than 2-step reduction: + // Our empirical research shows that if each thread reduces at least 64 + // elements individually, we get better performance. However, this can change + // on different platforms. In this step we force the code not to be + // morthan step reduction: Our empirical research shows that for inner_most + // dim reducer, it is better to have 8 group in a reduce dimension for sizes + // > 1024 to achieve the best performance. + const Index reductionPerThread = 64; + Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true); + const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP; + Index rGroups = (cu + pNumGroups - 1) / pNumGroups; + const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1; + const Index globalRange = pNumGroups * rNumGroups * localRange; + + EIGEN_CONSTEXPR Index scratchSize = + PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC); + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (rNumGroups > 1) { + CoeffReturnType *temp_pointer = static_cast( + dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType))); + EvaluatorPointerType temp_accessor = dev.get(temp_pointer); + dev.template unary_kernel_launcher( + self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve, + num_coeffs_to_reduce) + .wait(); + typedef SecondStepPartialReduction + SecondStepPartialReductionKernel; + dev.template unary_kernel_launcher( + temp_accessor, output, + cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), + Index(1), reducer, num_coeffs_to_preserve, rNumGroups) + .wait(); + self.device().deallocate_temp(temp_pointer); + } else { + dev.template unary_kernel_launcher( + self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve, + num_coeffs_to_reduce) + .wait(); + } + return false; + } +}; +} // namespace internal +} // namespace TensorSycl + +namespace internal { + +template +struct FullReducer { + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1; + static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) { + typedef std::conditional_t OutType; + static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & + (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; + + typename Self::Index inputSize = self.impl().dimensions().TotalSize(); + // In this step we force the code not to be more than 2-step reduction: + // Our empirical research shows that if each thread reduces at least 512 + // elements individually, we get better performance. + const Index reductionPerThread = 2048; + // const Index num_work_group = + Index reductionGroup = dev.getPowerOfTwo( + (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true); + const Index num_work_group = std::min(reductionGroup, local_range); + // 1 + // ? local_range + // : 1); + const Index global_range = num_work_group * local_range; + + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + typedef TensorSycl::internal::FullReductionKernelFunctor reduction_kernel_t; + if (num_work_group > 1) { + CoeffReturnType *temp_pointer = + static_cast(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType))); + typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer); + dev.template unary_kernel_launcher(self, tmp_global_accessor, thread_range, + local_range, inputSize, reducer) + .wait(); + typedef TensorSycl::internal::SecondStepFullReducer + GenericRKernel; + dev.template unary_kernel_launcher( + tmp_global_accessor, data, + cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), + num_work_group, reducer) + .wait(); + dev.deallocate_temp(temp_pointer); + } else { + dev.template unary_kernel_launcher(self, data, thread_range, local_range, inputSize, + reducer) + .wait(); + } + } +}; +// vectorizable inner_most most dim preserver +// col reduction +template +struct OuterReducer { + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, + typename Self::Index num_coeffs_to_preserve) { + return ::Eigen::TensorSycl::internal::PartialReducerLauncher< + Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output, + num_coeffs_to_reduce, + num_coeffs_to_preserve); + } +}; +// row reduction +template +struct InnerReducer { + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, + typename Self::Index num_coeffs_to_preserve) { + return ::Eigen::TensorSycl::internal::PartialReducerLauncher< + Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output, + num_coeffs_to_reduce, + num_coeffs_to_preserve); + } +}; + +// ArmgMax uses this kernel for partial reduction// +// TODO(@mehdi.goli) come up with a better kernel +// generic partial reduction +template +struct GenericReducer { + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false; + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce, + typename Self::Index num_coeffs_to_preserve) { + typename Self::Index range, GRange, tileSize; + dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); + + dev.template unary_kernel_launcher>( + self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1), + reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast(1)) + .wait(); + return false; + } +}; + +} // namespace internal +} // namespace Eigen + +#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h new file mode 100644 index 00000000000..7061f512093 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -0,0 +1,317 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H +#define EIGEN_CXX11_TENSOR_TENSOR_REF_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +template +class TensorLazyBaseEvaluator { + public: + TensorLazyBaseEvaluator() : m_refcount(0) {} + virtual ~TensorLazyBaseEvaluator() {} + + EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const = 0; + EIGEN_DEVICE_FUNC virtual const Scalar* data() const = 0; + + EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const = 0; + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) = 0; + + void incrRefCount() { ++m_refcount; } + void decrRefCount() { --m_refcount; } + int refCount() const { return m_refcount; } + + private: + // No copy, no assignment; + TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other); + TensorLazyBaseEvaluator& operator=(const TensorLazyBaseEvaluator& other); + + int m_refcount; +}; + +template +class TensorLazyEvaluatorReadOnly + : public TensorLazyBaseEvaluator::Scalar> { + public: + // typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Scalar Scalar; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef TensorEvaluator EvalType; + + TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) { + m_dims = m_impl.dimensions(); + m_impl.evalSubExprsIfNeeded(NULL); + } + virtual ~TensorLazyEvaluatorReadOnly() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const { return m_dims; } + EIGEN_DEVICE_FUNC virtual const Scalar* data() const { return m_impl.data(); } + + EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const { return m_impl.coeff(index); } + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex /*index*/) { + eigen_assert(false && "can't reference the coefficient of a rvalue"); + return m_dummy; + }; + + protected: + TensorEvaluator m_impl; + Dimensions m_dims; + Scalar m_dummy; +}; + +template +class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly { + public: + typedef TensorLazyEvaluatorReadOnly Base; + typedef typename Base::Scalar Scalar; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {} + virtual ~TensorLazyEvaluatorWritable() {} + + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) { return this->m_impl.coeffRef(index); } +}; + +template +class TensorLazyEvaluator : public std::conditional_t::value), + TensorLazyEvaluatorWritable, + TensorLazyEvaluatorReadOnly > { + public: + typedef std::conditional_t::value), + TensorLazyEvaluatorWritable, + TensorLazyEvaluatorReadOnly > + Base; + typedef typename Base::Scalar Scalar; + + TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) {} + virtual ~TensorLazyEvaluator() {} +}; + +} // namespace internal + +/** \class TensorRef + * \ingroup CXX11_Tensor_Module + * + * \brief A reference to a tensor expression + * The expression will be evaluated lazily (as much as possible). + * + */ +template +class TensorRef : public TensorBase > { + public: + typedef TensorRef Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::traits::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef Scalar* PointerType; + typedef PointerType PointerArgType; + + static constexpr Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + static constexpr int Layout = PlainObjectType::Layout; + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -----------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {} + + template + EIGEN_STRONG_INLINE TensorRef(const Expression& expr) + : m_evaluator(new internal::TensorLazyEvaluator(expr, DefaultDevice())) { + m_evaluator->incrRefCount(); + } + + template + EIGEN_STRONG_INLINE TensorRef& operator=(const Expression& expr) { + unrefEvaluator(); + m_evaluator = new internal::TensorLazyEvaluator(expr, DefaultDevice()); + m_evaluator->incrRefCount(); + return *this; + } + + ~TensorRef() { unrefEvaluator(); } + + TensorRef(const TensorRef& other) : TensorBase >(other), m_evaluator(other.m_evaluator) { + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + + TensorRef& operator=(const TensorRef& other) { + if (this != &other) { + unrefEvaluator(); + m_evaluator = other.m_evaluator; + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(Index index) const { return m_evaluator->coeff(index); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const { + const std::size_t num_indices = (sizeof...(otherIndices) + 1); + const array indices{{firstIndex, otherIndices...}}; + return coeff(indices); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) { + const std::size_t num_indices = (sizeof...(otherIndices) + 1); + const array indices{{firstIndex, otherIndices...}}; + return coeffRef(indices); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(const array& indices) const { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (size_t i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices - 1]; + for (int i = NumIndices - 2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeff(index); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (size_t i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices - 1]; + for (int i = NumIndices - 2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index index) const { return m_evaluator->coeff(index); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return m_evaluator->coeffRef(index); } + + private: + EIGEN_STRONG_INLINE void unrefEvaluator() { + if (m_evaluator) { + m_evaluator->decrRefCount(); + if (m_evaluator->refCount() == 0) { + delete m_evaluator; + } + } + } + + internal::TensorLazyBaseEvaluator* m_evaluator; +}; + +// evaluator for rvalues +template +struct TensorEvaluator, Device> { + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorRef::Layout; + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) : m_ref(m) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } + + EIGEN_STRONG_INLINE void cleanup() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_ref.coeff(index); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return m_ref.coeffRef(index); } + + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_ref.data(); } + + protected: + TensorRef m_ref; +}; + +// evaluator for lvalues +template +struct TensorEvaluator, Device> : public TensorEvaluator, Device> { + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + typedef TensorEvaluator, Device> Base; + + enum { IsAligned = false, PacketAccess = false, BlockAccess = false, PreferBlockAccess = false, RawAccess = false }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(TensorRef& m, const Device& d) : Base(m, d) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return this->m_ref.coeffRef(index); } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h new file mode 100644 index 00000000000..a81a2babfda --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -0,0 +1,410 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Navdeep Jaitly +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorReverse + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reverse elements class. + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorReverseOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorReverseOp type; +}; + +} // end namespace internal + +template +class TensorReverseOp : public TensorBase, WriteAccessors> { + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(const XprType& expr, const ReverseDimensions& reverse_dims) + : m_xpr(expr), m_reverse_dims(reverse_dims) {} + + EIGEN_DEVICE_FUNC const ReverseDimensions& reverse() const { return m_reverse_dims; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReverseOp) + + protected: + typename XprType::Nested m_xpr; + const ReverseDimensions m_reverse_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorReverseOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = NumDims > 0, + PreferBlockAccess = true, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef internal::TensorIntDivisor IndexDivisor; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reverse(op.reverse()), m_device(device) { + // Reversing a scalar isn't supported yet. It would be a no-op anyway. + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Compute strides + m_dimensions = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]); + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex(Index index) const { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + Index idx = index / m_fastStrides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i]; + } + if (m_reverse[0]) { + inputIndex += (m_dimensions[0] - index - 1); + } else { + inputIndex += index; + } + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + Index idx = index / m_fastStrides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i]; + } + if (m_reverse[NumDims - 1]) { + inputIndex += (m_dimensions[NumDims - 1] - index - 1); + } else { + inputIndex += index; + } + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_impl.coeff(reverseIndex(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + // TODO(ndjaitly): write a better packing routine that uses + // local structure. + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + // Block evaluation reads underlying memory in reverse order, and default + // cost model does not properly catch this in bytes stored/loaded. + return internal::TensorBlockResourceRequirements::skewed(target_size).addCostPerCoeff({0, 0, 24}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + // TODO(ezhulenev): If underlying tensor expression supports and prefers + // block evaluation we must use it. Currently we use coeff and packet + // access into the underlying tensor expression. + // static const bool useBlockAccessForArgType = + // TensorEvaluator::BlockAccess && + // TensorEvaluator::PreferBlockAccess; + + static const bool isColMajor = static_cast(Layout) == static_cast(ColMajor); + + static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1; + const bool inner_dim_reversed = m_reverse[inner_dim_idx]; + + // Offset in the output block. + Index block_offset = 0; + + // Offset in the input Tensor. + Index input_offset = reverseIndex(desc.offset()); + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - 1 - i; + it[i].size = desc.dimension(dim); + it[i].count = 0; + it[i].reverse = m_reverse[dim]; + + it[i].block_stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].block_stride); + it[i].block_span = it[i].block_stride * (it[i].size - 1); + + it[i].input_stride = m_strides[dim]; + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + if (it[i].reverse) { + it[i].input_stride = -1 * it[i].input_stride; + it[i].input_span = -1 * it[i].input_span; + } + } + + // If multiple inner dimensions have the same reverse flag, check if we can + // merge them into a single virtual inner dimension. + int effective_inner_dim = 0; + for (int i = 1; i < NumDims; ++i) { + if (it[i].reverse != it[effective_inner_dim].reverse) break; + if (it[i].block_stride != it[effective_inner_dim].size) break; + if (it[i].block_stride != numext::abs(it[i].input_stride)) break; + + it[i].size = it[effective_inner_dim].size * it[i].size; + + it[i].block_stride = 1; + it[i].input_stride = (inner_dim_reversed ? -1 : 1); + + it[i].block_span = it[i].block_stride * (it[i].size - 1); + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + effective_inner_dim = i; + } + + eigen_assert(it[effective_inner_dim].block_stride == 1); + eigen_assert(it[effective_inner_dim].input_stride == (inner_dim_reversed ? -1 : 1)); + + const Index inner_dim_size = it[effective_inner_dim].size; + + // Prepare storage for the materialized reverse result. + const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + CoeffReturnType* block_buffer = block_storage.data(); + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + // Copy inner-most dimension data from reversed location in input. + Index dst = block_offset; + Index src = input_offset; + + // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed + // worse results in benchmarks than a simple coefficient loop. + if (inner_dim_reversed) { + for (Index i = 0; i < inner_dim_size; ++i) { + block_buffer[dst] = m_impl.coeff(src); + ++dst; + --src; + } + } else { + for (Index i = 0; i < inner_dim_size; ++i) { + block_buffer[dst] = m_impl.coeff(src); + ++dst; + ++src; + } + } + + // For the 1d tensor we need to generate only one inner-most dimension. + if ((NumDims - effective_inner_dim) == 1) break; + + // Update offset. + for (Index i = effective_inner_dim + 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + block_offset += it[i].block_stride; + input_offset += it[i].input_stride; + break; + } + if (i != NumDims - 1) it[i].count = 0; + block_offset -= it[i].block_span; + input_offset -= it[i].input_span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + for (int i = 0; i < NumDims; ++i) { + if (m_reverse[i]) { + compute_cost += 2 * TensorOpCost::AddCost(); + } + } + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_strides; + array m_fastStrides; + TensorEvaluator m_impl; + ReverseDimensions m_reverse; + const Device EIGEN_DEVICE_REF m_device; + + private: + struct BlockIteratorState { + BlockIteratorState() + : size(0), count(0), reverse(false), block_stride(0), block_span(0), input_stride(0), input_span(0) {} + + Index size; + Index count; + bool reverse; + Index block_stride; + Index block_span; + Index input_stride; + Index input_span; + }; +}; + +// Eval as lvalue + +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorReverseOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const { + return this->m_impl.coeffRef(this->reverseIndex(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + // This code is pilfered from TensorMorphing.h + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; + internal::pstore(values, x); + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index + i) = values[i]; + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRoll.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRoll.h new file mode 100644 index 00000000000..d5b203ab619 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorRoll.h @@ -0,0 +1,361 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2024 Tobias Wood tobias@spinicist.org.uk +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ROLL_H +#define EIGEN_CXX11_TENSOR_TENSOR_ROLL_H +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorRoll + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor roll (circular shift) elements class. + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorRollOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorRollOp type; +}; + +} // end namespace internal + +template +class TensorRollOp : public TensorBase, WriteAccessors> { + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorRollOp(const XprType& expr, const RollDimensions& roll_dims) + : m_xpr(expr), m_roll_dims(roll_dims) {} + + EIGEN_DEVICE_FUNC const RollDimensions& roll() const { return m_roll_dims; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorRollOp) + + protected: + typename XprType::Nested m_xpr; + const RollDimensions m_roll_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorRollOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = NumDims > 0, + PreferBlockAccess = true, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef internal::TensorIntDivisor IndexDivisor; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + using TensorBlockDesc = internal::TensorBlockDescriptor; + using TensorBlockScratch = internal::TensorBlockScratchAllocator; + using ArgTensorBlock = typename TensorEvaluator::TensorBlock; + using TensorBlock = typename internal::TensorMaterializedBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_rolls(op.roll()), m_device(device) { + EIGEN_STATIC_ASSERT((NumDims > 0), Must_Have_At_Least_One_Dimension_To_Roll); + + // Compute strides + m_dimensions = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + if (m_strides[i] > 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + if (m_strides[i] > 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(nullptr); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index roll(Index const i, Index const r, Index const n) const { + auto const tmp = (i + r) % n; + if (tmp < 0) { + return tmp + n; + } else { + return tmp; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array rollCoords(array const& coords) const { + array rolledCoords; + for (int id = 0; id < NumDims; id++) { + eigen_assert(coords[id] < m_dimensions[id]); + rolledCoords[id] = roll(coords[id], m_rolls[id], m_dimensions[id]); + } + return rolledCoords; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rollIndex(Index index) const { + eigen_assert(index < dimensions().TotalSize()); + Index rolledIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + rolledIndex += roll(idx, m_rolls[i], m_dimensions[i]) * m_strides[i]; + } + rolledIndex += roll(index, m_rolls[0], m_dimensions[0]); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + rolledIndex += roll(idx, m_rolls[i], m_dimensions[i]) * m_strides[i]; + } + rolledIndex += roll(index, m_rolls[NumDims - 1], m_dimensions[NumDims - 1]); + } + return rolledIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_impl.coeff(rollIndex(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::skewed(target_size).addCostPerCoeff({0, 0, 24}); + } + + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + static const bool is_col_major = static_cast(Layout) == static_cast(ColMajor); + + // Compute spatial coordinates for the first block element. + array coords; + extract_coordinates(desc.offset(), coords); + array initial_coords = coords; + Index offset = 0; // Offset in the output block buffer. + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims; ++i) { + const int dim = is_col_major ? i : NumDims - 1 - i; + it[i].size = desc.dimension(dim); + it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride); + it[i].span = it[i].stride * (it[i].size - 1); + it[i].count = 0; + } + eigen_assert(it[0].stride == 1); + + // Prepare storage for the materialized generator result. + const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch); + CoeffReturnType* block_buffer = block_storage.data(); + + static const int inner_dim = is_col_major ? 0 : NumDims - 1; + const Index inner_dim_size = it[0].size; + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + Index i = 0; + for (; i < inner_dim_size; ++i) { + auto const rolled = rollCoords(coords); + auto const index = is_col_major ? m_dimensions.IndexOfColMajor(rolled) : m_dimensions.IndexOfRowMajor(rolled); + *(block_buffer + offset + i) = m_impl.coeff(index); + coords[inner_dim]++; + } + coords[inner_dim] = initial_coords[inner_dim]; + + if (NumDims == 1) break; // For the 1d tensor we need to generate only one inner-most dimension. + + // Update offset. + for (i = 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + offset += it[i].stride; + coords[is_col_major ? i : NumDims - 1 - i]++; + break; + } + if (i != NumDims - 1) it[i].count = 0; + coords[is_col_major ? i : NumDims - 1 - i] = initial_coords[is_col_major ? i : NumDims - 1 - i]; + offset -= it[i].span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + for (int i = 0; i < NumDims; ++i) { + compute_cost += 2 * TensorOpCost::AddCost(); + } + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return nullptr; } + + protected: + Dimensions m_dimensions; + array m_strides; + array m_fast_strides; + TensorEvaluator m_impl; + RollDimensions m_rolls; + const Device EIGEN_DEVICE_REF m_device; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_coordinates(Index index, array& coords) const { + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[NumDims - 1] = index; + } + } + + private: +}; + +// Eval as lvalue + +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + typedef TensorRollOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, + RawAccess = false + }; + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const { + return this->m_impl.coeffRef(this->rollIndex(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; + internal::pstore(values, x); + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index + i) = values[i]; + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ROLL_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h new file mode 100644 index 00000000000..d16d1490c8a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -0,0 +1,474 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Igor Babuschkin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H +#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorScanOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorScanOp type; +}; +} // end namespace internal + +/** \class TensorScan + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor scan class. + */ +template +class TensorScanOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(const XprType& expr, const Index& axis, bool exclusive = false, + const Op& op = Op()) + : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index axis() const { return m_axis; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op accumulator() const { return m_accumulator; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { return m_exclusive; } + + protected: + typename XprType::Nested m_expr; + const Index m_axis; + const Op m_accumulator; + const bool m_exclusive; +}; + +namespace internal { + +template +EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset, typename Self::CoeffReturnType* data) { + // Compute the scan along the axis, starting at the given offset + typename Self::CoeffReturnType accum = self.accumulator().initialize(); + if (self.stride() == 1) { + if (self.exclusive()) { + for (Index curr = offset; curr < offset + self.size(); ++curr) { + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } + } else { + for (Index curr = offset; curr < offset + self.size(); ++curr) { + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); + } + } + } else { + if (self.exclusive()) { + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + Index curr = offset + idx3 * self.stride(); + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } + } else { + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + Index curr = offset + idx3 * self.stride(); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); + } + } + } +} + +template +EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset, typename Self::CoeffReturnType* data) { + using Scalar = typename Self::CoeffReturnType; + using Packet = typename Self::PacketReturnType; + // Compute the scan along the axis, starting at the calculated offset + Packet accum = self.accumulator().template initializePacket(); + if (self.stride() == 1) { + if (self.exclusive()) { + for (Index curr = offset; curr < offset + self.size(); ++curr) { + internal::pstoreu(data + curr, self.accumulator().finalizePacket(accum)); + self.accumulator().reducePacket(self.inner().template packet(curr), &accum); + } + } else { + for (Index curr = offset; curr < offset + self.size(); ++curr) { + self.accumulator().reducePacket(self.inner().template packet(curr), &accum); + internal::pstoreu(data + curr, self.accumulator().finalizePacket(accum)); + } + } + } else { + if (self.exclusive()) { + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + const Index curr = offset + idx3 * self.stride(); + internal::pstoreu(data + curr, self.accumulator().finalizePacket(accum)); + self.accumulator().reducePacket(self.inner().template packet(curr), &accum); + } + } else { + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + const Index curr = offset + idx3 * self.stride(); + self.accumulator().reducePacket(self.inner().template packet(curr), &accum); + internal::pstoreu(data + curr, self.accumulator().finalizePacket(accum)); + } + } + } +} + +template +struct ReduceBlock { + EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) { + for (Index idx2 = 0; idx2 < self.stride(); idx2++) { + // Calculate the starting offset for the scan + Index offset = idx1 + idx2; + ReduceScalar(self, offset, data); + } + } +}; + +// Specialization for vectorized reduction. +template +struct ReduceBlock { + EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) { + using Packet = typename Self::PacketReturnType; + const int PacketSize = internal::unpacket_traits::size; + Index idx2 = 0; + for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) { + // Calculate the starting offset for the packet scan + Index offset = idx1 + idx2; + ReducePacket(self, offset, data); + } + for (; idx2 < self.stride(); idx2++) { + // Calculate the starting offset for the scan + Index offset = idx1 + idx2; + ReduceScalar(self, offset, data); + } + } +}; + +// Single-threaded CPU implementation of scan +template ::PacketAccess && + internal::reducer_traits::PacketAccess)> +struct ScanLauncher { + void operator()(Self& self, typename Self::CoeffReturnType* data) const { + Index total_size = internal::array_prod(self.dimensions()); + + // We fix the index along the scan axis to 0 and perform a + // scan per remaining entry. The iteration is split into two nested + // loops to avoid an integer division by keeping track of each idx1 and + // idx2. + for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) { + ReduceBlock block_reducer; + block_reducer(self, idx1, data); + } + } +}; + +#ifdef EIGEN_USE_THREADS + +// Adjust block_size to avoid false sharing of cachelines among +// threads. Currently set to twice the cache line size on Intel and ARM +// processors. +EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) { + EIGEN_CONSTEXPR Index kBlockAlignment = 128; + const Index items_per_cacheline = numext::maxi(1, kBlockAlignment / item_size); + return items_per_cacheline * numext::div_ceil(block_size, items_per_cacheline); +} + +template +struct ReduceBlock { + EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) { + using Scalar = typename Self::CoeffReturnType; + using Packet = typename Self::PacketReturnType; + const int PacketSize = internal::unpacket_traits::size; + Index num_scalars = self.stride(); + Index num_packets = 0; + if (self.stride() >= PacketSize) { + num_packets = self.stride() / PacketSize; + self.device().parallelFor( + num_packets, + TensorOpCost(PacketSize * self.size(), PacketSize * self.size(), 16 * PacketSize * self.size(), true, + PacketSize), + // Make the shard size large enough that two neighboring threads + // won't write to the same cacheline of `data`. + [=](Index blk_size) { return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size); }, + [&](Index first, Index last) { + for (Index packet = first; packet < last; ++packet) { + const Index idx2 = packet * PacketSize; + ReducePacket(self, idx1 + idx2, data); + } + }); + num_scalars -= num_packets * PacketSize; + } + self.device().parallelFor( + num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()), + // Make the shard size large enough that two neighboring threads + // won't write to the same cacheline of `data`. + [=](Index blk_size) { return AdjustBlockSize(sizeof(Scalar), blk_size); }, + [&](Index first, Index last) { + for (Index scalar = first; scalar < last; ++scalar) { + const Index idx2 = num_packets * PacketSize + scalar; + ReduceScalar(self, idx1 + idx2, data); + } + }); + } +}; + +template +struct ReduceBlock { + EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) { + using Scalar = typename Self::CoeffReturnType; + self.device().parallelFor( + self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()), + // Make the shard size large enough that two neighboring threads + // won't write to the same cacheline of `data`. + [=](Index blk_size) { return AdjustBlockSize(sizeof(Scalar), blk_size); }, + [&](Index first, Index last) { + for (Index idx2 = first; idx2 < last; ++idx2) { + ReduceScalar(self, idx1 + idx2, data); + } + }); + } +}; + +// Specialization for multi-threaded execution. +template +struct ScanLauncher { + void operator()(Self& self, typename Self::CoeffReturnType* data) { + using Scalar = typename Self::CoeffReturnType; + using Packet = typename Self::PacketReturnType; + const int PacketSize = internal::unpacket_traits::size; + const Index total_size = internal::array_prod(self.dimensions()); + const Index inner_block_size = self.stride() * self.size(); + bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size)); + + if ((parallelize_by_outer_blocks && total_size <= 4096) || + (!parallelize_by_outer_blocks && self.stride() < PacketSize)) { + ScanLauncher launcher; + launcher(self, data); + return; + } + + if (parallelize_by_outer_blocks) { + // Parallelize over outer blocks. + const Index num_outer_blocks = total_size / inner_block_size; + self.device().parallelFor( + num_outer_blocks, + TensorOpCost(inner_block_size, inner_block_size, 16 * PacketSize * inner_block_size, Vectorize, PacketSize), + [=](Index blk_size) { return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size); }, + [&](Index first, Index last) { + for (Index idx1 = first; idx1 < last; ++idx1) { + ReduceBlock block_reducer; + block_reducer(self, idx1 * inner_block_size, data); + } + }); + } else { + // Parallelize over inner packets/scalars dimensions when the reduction + // axis is not an inner dimension. + ReduceBlock block_reducer; + for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) { + block_reducer(self, idx1, data); + } + } + } +}; +#endif // EIGEN_USE_THREADS + +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + +// GPU implementation of scan +// TODO(ibab) This placeholder implementation performs multiple scans in +// parallel, but it would be better to use a parallel scan algorithm and +// optimize memory access. +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, + typename Self::CoeffReturnType* data) { + // Compute offset as in the CPU version + Index val = threadIdx.x + blockIdx.x * blockDim.x; + Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride(); + + if (offset + (self.size() - 1) * self.stride() < total_size) { + // Compute the scan along the axis, starting at the calculated offset + typename Self::CoeffReturnType accum = self.accumulator().initialize(); + for (Index idx = 0; idx < self.size(); idx++) { + Index curr = offset + idx * self.stride(); + if (self.exclusive()) { + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } else { + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); + } + } + } + __syncthreads(); +} + +template +struct ScanLauncher { + void operator()(const Self& self, typename Self::CoeffReturnType* data) { + Index total_size = internal::array_prod(self.dimensions()); + Index num_blocks = (total_size / self.size() + 63) / 64; + Index block_size = 64; + + LAUNCH_GPU_KERNEL((ScanKernel), num_blocks, block_size, 0, self.device(), self, total_size, data); + } +}; +#endif // EIGEN_USE_GPU && (EIGEN_GPUCC) + +} // namespace internal + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorScanOp XprType; + typedef typename XprType::Index Index; + typedef const ArgType ChildTypeNoConst; + typedef const ArgType ChildType; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef std::remove_const_t Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef TensorEvaluator, Device> Self; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + CoordAccess = false, + RawAccess = true + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), + m_device(device), + m_exclusive(op.exclusive()), + m_accumulator(op.accumulator()), + m_size(m_impl.dimensions()[op.axis()]), + m_stride(1), + m_consume_dim(op.axis()), + m_output(NULL) { + // Accumulating a scalar isn't supported. + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(op.axis() >= 0 && op.axis() < NumDims); + + // Compute stride of scan axis + const Dimensions& dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < op.axis(); ++i) { + m_stride = m_stride * dims[i]; + } + } else { + // dims can only be indexed through unsigned integers, + // so let's use an unsigned type to let the compiler knows. + // This prevents stupid warnings: ""'*((void*)(& evaluator)+64)[18446744073709551615]' may be used uninitialized + // in this function" + unsigned int axis = internal::convert_index(op.axis()); + for (unsigned int i = NumDims - 1; i > axis; --i) { + m_stride = m_stride * dims[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const { return m_stride; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const { return m_consume_dim; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const { return m_size; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const { return m_accumulator; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { return m_exclusive; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& inner() const { return m_impl; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + internal::ScanLauncher launcher; + if (data) { + launcher(*this, data); + return false; + } + + const Index total_size = internal::array_prod(dimensions()); + m_output = + static_cast(m_device.get((Scalar*)m_device.allocate_temp(total_size * sizeof(Scalar)))); + launcher(*this, m_output); + return true; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_output + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_output; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_output[index]; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + + EIGEN_STRONG_INLINE void cleanup() { + if (m_output) { + m_device.deallocate_temp(m_output); + m_output = NULL; + } + m_impl.cleanup(); + } + + protected: + TensorEvaluator m_impl; + const Device EIGEN_DEVICE_REF m_device; + const bool m_exclusive; + Op m_accumulator; + const Index m_size; + Index m_stride; + Index m_consume_dim; + EvaluatorPointerType m_output; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h new file mode 100644 index 00000000000..30fde91970a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h @@ -0,0 +1,506 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorScanSycl.h + * + * \brief: + * Tensor Scan Sycl implement the extend version of + * "Efficient parallel scan algorithms for GPUs." .for Tensor operations. + * The algorithm requires up to 3 stage (consequently 3 kernels) depending on + * the size of the tensor. In the first kernel (ScanKernelFunctor), each + * threads within the work-group individually reduces the allocated elements per + * thread in order to reduces the total number of blocks. In the next step all + * thread within the work-group will reduce the associated blocks into the + * temporary buffers. In the next kernel(ScanBlockKernelFunctor), the temporary + * buffer is given as an input and all the threads within a work-group scan and + * reduces the boundaries between the blocks (generated from the previous + * kernel). and write the data on the temporary buffer. If the second kernel is + * required, the third and final kernel (ScanAdjustmentKernelFunctor) will + * adjust the final result into the output buffer. + * The original algorithm for the parallel prefix sum can be found here: + * + * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel + * scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003 + *1, no. 1 (2008): 1-17. + *****************************************************************/ + +#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP +#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace TensorSycl { +namespace internal { + +#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE +#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4) +#endif + +template +struct ScanParameters { + // must be power of 2 + static EIGEN_CONSTEXPR index_t ScanPerThread = 8; + const index_t total_size; + const index_t non_scan_size; + const index_t scan_size; + const index_t non_scan_stride; + const index_t scan_stride; + const index_t panel_threads; + const index_t group_threads; + const index_t block_threads; + const index_t elements_per_group; + const index_t elements_per_block; + const index_t loop_range; + + ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_, + index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_, + index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_) + : total_size(total_size_), + non_scan_size(non_scan_size_), + scan_size(scan_size_), + non_scan_stride(non_scan_stride_), + scan_stride(scan_stride_), + panel_threads(panel_threads_), + group_threads(group_threads_), + block_threads(block_threads_), + elements_per_group(elements_per_group_), + elements_per_block(elements_per_block_), + loop_range(loop_range_) {} +}; + +enum class scan_step { first, second }; +template +struct ScanKernelFunctor { + typedef cl::sycl::accessor + LocalAccessor; + static EIGEN_CONSTEXPR int PacketSize = ScanParameters::ScanPerThread / 2; + + LocalAccessor scratch; + Evaluator dev_eval; + OutAccessor out_ptr; + OutAccessor tmp_ptr; + const ScanParameters scanParameters; + Op accumulator; + const bool inclusive; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_, + OutAccessor out_accessor_, OutAccessor temp_accessor_, + const ScanParameters scanParameters_, Op accumulator_, + const bool inclusive_) + : scratch(scratch_), + dev_eval(dev_eval_), + out_ptr(out_accessor_), + tmp_ptr(temp_accessor_), + scanParameters(scanParameters_), + accumulator(accumulator_), + inclusive(inclusive_) {} + + template + std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read( + const Input &inpt, Index global_id) const { + return inpt.coeff(global_id); + } + + template + std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read( + const Input &inpt, Index global_id) const { + return inpt[global_id]; + } + + template + std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation( + InclusiveOp inclusive_op) const { + inclusive_op(); + } + + template + std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation( + InclusiveOp) const {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { + for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) { + Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset)); + Index tmp = data_offset % scanParameters.panel_threads; + const Index panel_id = data_offset / scanParameters.panel_threads; + const Index group_id = tmp / scanParameters.group_threads; + tmp = tmp % scanParameters.group_threads; + const Index block_id = tmp / scanParameters.block_threads; + const Index local_id = tmp % scanParameters.block_threads; + // we put one element per packet in scratch_mem + const Index scratch_stride = scanParameters.elements_per_block / PacketSize; + const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride; + CoeffReturnType private_scan[ScanParameters::ScanPerThread]; + CoeffReturnType inclusive_scan; + // the actual panel size is scan_size * non_scan_size. + // elements_per_panel is roundup to power of 2 for binary tree + const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size; + const Index group_offset = group_id * scanParameters.non_scan_stride; + // This will be effective when the size is bigger than elements_per_block + const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride; + const Index thread_offset = (ScanParameters::ScanPerThread * local_id * scanParameters.scan_stride); + const Index global_offset = panel_offset + group_offset + block_offset + thread_offset; + Index next_elements = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < ScanParameters::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + private_scan[i] = ((((block_id * scanParameters.elements_per_block) + + (ScanParameters::ScanPerThread * local_id) + i) < scanParameters.scan_size) && + (global_id < scanParameters.total_size)) + ? read(dev_eval, global_id) + : accumulator.initialize(); + next_elements += scanParameters.scan_stride; + } + first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC { + if (inclusive) { + inclusive_scan = private_scan[ScanParameters::ScanPerThread - 1]; + } + }); + // This for loop must be 2 + EIGEN_UNROLL_LOOP + for (int packetIndex = 0; packetIndex < ScanParameters::ScanPerThread; packetIndex += PacketSize) { + Index private_offset = 1; + // build sum in place up the tree + EIGEN_UNROLL_LOOP + for (Index d = PacketSize >> 1; d > 0; d >>= 1) { + EIGEN_UNROLL_LOOP + for (Index l = 0; l < d; l++) { + Index ai = private_offset * (2 * l + 1) - 1 + packetIndex; + Index bi = private_offset * (2 * l + 2) - 1 + packetIndex; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(private_scan[ai], &accum); + accumulator.reduce(private_scan[bi], &accum); + private_scan[bi] = accumulator.finalize(accum); + } + private_offset *= 2; + } + scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset] = + private_scan[PacketSize - 1 + packetIndex]; + private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize(); + // traverse down tree & build scan + EIGEN_UNROLL_LOOP + for (Index d = 1; d < PacketSize; d *= 2) { + private_offset >>= 1; + EIGEN_UNROLL_LOOP + for (Index l = 0; l < d; l++) { + Index ai = private_offset * (2 * l + 1) - 1 + packetIndex; + Index bi = private_offset * (2 * l + 2) - 1 + packetIndex; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(private_scan[ai], &accum); + accumulator.reduce(private_scan[bi], &accum); + private_scan[ai] = private_scan[bi]; + private_scan[bi] = accumulator.finalize(accum); + } + } + } + + Index offset = 1; + // build sum in place up the tree + for (Index d = scratch_stride >> 1; d > 0; d >>= 1) { + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (local_id < d) { + Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset; + Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(scratch[ai], &accum); + accumulator.reduce(scratch[bi], &accum); + scratch[bi] = accumulator.finalize(accum); + } + offset *= 2; + } + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + // next step optimisation + if (local_id == 0) { + if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) { + const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) * + scanParameters.non_scan_size + + group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) + + block_id; + tmp_ptr[temp_id] = scratch[scratch_stride - 1 + scratch_offset]; + } + // clear the last element + scratch[scratch_stride - 1 + scratch_offset] = accumulator.initialize(); + } + // traverse down tree & build scan + for (Index d = 1; d < scratch_stride; d *= 2) { + offset >>= 1; + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (local_id < d) { + Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset; + Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(scratch[ai], &accum); + accumulator.reduce(scratch[bi], &accum); + scratch[ai] = scratch[bi]; + scratch[bi] = accumulator.finalize(accum); + } + } + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + // This for loop must be 2 + EIGEN_UNROLL_LOOP + for (int packetIndex = 0; packetIndex < ScanParameters::ScanPerThread; packetIndex += PacketSize) { + EIGEN_UNROLL_LOOP + for (Index i = 0; i < PacketSize; i++) { + CoeffReturnType accum = private_scan[packetIndex + i]; + accumulator.reduce(scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum); + private_scan[packetIndex + i] = accumulator.finalize(accum); + } + } + first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC { + if (inclusive) { + accumulator.reduce(private_scan[ScanParameters::ScanPerThread - 1], &inclusive_scan); + private_scan[0] = accumulator.finalize(inclusive_scan); + } + }); + next_elements = 0; + // right the first set of private param + EIGEN_UNROLL_LOOP + for (Index i = 0; i < ScanParameters::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + if ((((block_id * scanParameters.elements_per_block) + (ScanParameters::ScanPerThread * local_id) + i) < + scanParameters.scan_size) && + (global_id < scanParameters.total_size)) { + Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters::ScanPerThread) * (inclusive)); + out_ptr[global_id] = private_scan[private_id]; + } + next_elements += scanParameters.scan_stride; + } + } // end for loop + } +}; + +template +struct ScanAdjustmentKernelFunctor { + typedef cl::sycl::accessor + LocalAccessor; + static EIGEN_CONSTEXPR int PacketSize = ScanParameters::ScanPerThread / 2; + InAccessor in_ptr; + OutAccessor out_ptr; + const ScanParameters scanParameters; + Op accumulator; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_, + OutAccessor out_accessor_, + const ScanParameters scanParameters_, + Op accumulator_) + : in_ptr(in_accessor_), out_ptr(out_accessor_), scanParameters(scanParameters_), accumulator(accumulator_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { + for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) { + Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset)); + Index tmp = data_offset % scanParameters.panel_threads; + const Index panel_id = data_offset / scanParameters.panel_threads; + const Index group_id = tmp / scanParameters.group_threads; + tmp = tmp % scanParameters.group_threads; + const Index block_id = tmp / scanParameters.block_threads; + const Index local_id = tmp % scanParameters.block_threads; + + // the actual panel size is scan_size * non_scan_size. + // elements_per_panel is roundup to power of 2 for binary tree + const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size; + const Index group_offset = group_id * scanParameters.non_scan_stride; + // This will be effective when the size is bigger than elements_per_block + const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride; + const Index thread_offset = ScanParameters::ScanPerThread * local_id * scanParameters.scan_stride; + + const Index global_offset = panel_offset + group_offset + block_offset + thread_offset; + const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block; + const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id; + CoeffReturnType adjust_val = in_ptr[in_id]; + + Index next_elements = 0; + EIGEN_UNROLL_LOOP + for (Index i = 0; i < ScanParameters::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + if ((((block_id * scanParameters.elements_per_block) + (ScanParameters::ScanPerThread * local_id) + i) < + scanParameters.scan_size) && + (global_id < scanParameters.total_size)) { + CoeffReturnType accum = adjust_val; + accumulator.reduce(out_ptr[global_id], &accum); + out_ptr[global_id] = accumulator.finalize(accum); + } + next_elements += scanParameters.scan_stride; + } + } + } +}; + +template +struct ScanInfo { + const Index &total_size; + const Index &scan_size; + const Index &panel_size; + const Index &non_scan_size; + const Index &scan_stride; + const Index &non_scan_stride; + + Index max_elements_per_block; + Index block_size; + Index panel_threads; + Index group_threads; + Index block_threads; + Index elements_per_group; + Index elements_per_block; + Index loop_range; + Index global_range; + Index local_range; + const Eigen::SyclDevice &dev; + EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_, + const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_, + const Eigen::SyclDevice &dev_) + : total_size(total_size_), + scan_size(scan_size_), + panel_size(panel_size_), + non_scan_size(non_scan_size_), + scan_stride(scan_stride_), + non_scan_stride(non_scan_stride_), + dev(dev_) { + // must be power of 2 + local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()), + Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1)); + + max_elements_per_block = local_range * ScanParameters::ScanPerThread; + + elements_per_group = + dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters::ScanPerThread)), true); + const Index elements_per_panel = elements_per_group * non_scan_size; + elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block)); + panel_threads = elements_per_panel / ScanParameters::ScanPerThread; + group_threads = elements_per_group / ScanParameters::ScanPerThread; + block_threads = elements_per_block / ScanParameters::ScanPerThread; + block_size = elements_per_group / elements_per_block; +#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE + const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE)); +#else + const Index max_threads = panel_threads * panel_size; +#endif + global_range = roundUp(max_threads, local_range); + loop_range = Index( + std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters::ScanPerThread))); + } + inline ScanParameters get_scan_parameter() { + return ScanParameters(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads, + group_threads, block_threads, elements_per_group, elements_per_block, loop_range); + } + inline cl::sycl::nd_range<1> get_thread_range() { + return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + } +}; + +template +struct SYCLAdjustBlockOffset { + EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr, + Reducer &accumulator, const Index total_size, + const Index scan_size, const Index panel_size, + const Index non_scan_size, const Index scan_stride, + const Index non_scan_stride, const Eigen::SyclDevice &dev) { + auto scan_info = + ScanInfo(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev); + + typedef ScanAdjustmentKernelFunctor + AdjustFuctor; + dev.template unary_kernel_launcher(in_ptr, out_ptr, scan_info.get_thread_range(), + scan_info.max_elements_per_block, + scan_info.get_scan_parameter(), accumulator) + .wait(); + } +}; + +template +struct ScanLauncher_impl { + template + EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator, + const Index total_size, const Index scan_size, const Index panel_size, + const Index non_scan_size, const Index scan_stride, + const Index non_scan_stride, const bool inclusive, + const Eigen::SyclDevice &dev) { + auto scan_info = + ScanInfo(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev); + const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size; + const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters::ScanPerThread / 2); + CoeffReturnType *temp_pointer = + static_cast(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer); + + typedef ScanKernelFunctor ScanFunctor; + dev.template binary_kernel_launcher( + in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size, + scan_info.get_scan_parameter(), accumulator, inclusive) + .wait(); + + if (scan_info.block_size > 1) { + ScanLauncher_impl::scan_block( + tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size, + non_scan_size, Index(1), scan_info.block_size, false, dev); + + SYCLAdjustBlockOffset::adjust_scan_block_offset( + tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, + non_scan_stride, dev); + } + dev.deallocate_temp(temp_pointer); + } +}; + +} // namespace internal +} // namespace TensorSycl +namespace internal { +template +struct ScanLauncher { + typedef typename Self::Index Index; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::Storage Storage; + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; + void operator()(Self &self, EvaluatorPointerType data) const { + const Index total_size = internal::array_prod(self.dimensions()); + const Index scan_size = self.size(); + const Index scan_stride = self.stride(); + // this is the scan op (can be sum or ...) + auto accumulator = self.accumulator(); + auto inclusive = !self.exclusive(); + auto consume_dim = self.consume_dim(); + auto dev = self.device(); + + auto dims = self.inner().dimensions(); + + Index non_scan_size = 1; + Index panel_size = 1; + if (static_cast(Self::Layout) == static_cast(ColMajor)) { + for (int i = 0; i < consume_dim; i++) { + non_scan_size *= dims[i]; + } + for (int i = consume_dim + 1; i < Self::NumDims; i++) { + panel_size *= dims[i]; + } + } else { + for (int i = Self::NumDims - 1; i > consume_dim; i--) { + non_scan_size *= dims[i]; + } + for (int i = consume_dim - 1; i >= 0; i--) { + panel_size *= dims[i]; + } + } + const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size; + auto eval_impl = self.inner(); + TensorSycl::internal::ScanLauncher_impl::scan_block( + eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, + inclusive, dev); + } +}; +} // namespace internal +} // namespace Eigen + +#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h new file mode 100644 index 00000000000..11ce0fb4e48 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -0,0 +1,415 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H +#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorShuffling + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor shuffling class. + * + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorShufflingOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorShufflingOp type; +}; + +} // end namespace internal + +template +class TensorShufflingOp : public TensorBase > { + public: + typedef TensorBase > Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl) + : m_xpr(expr), m_shuffle(shfl) {} + + EIGEN_DEVICE_FUNC const Shuffle& shufflePermutation() const { return m_shuffle; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp) + + protected: + typename XprType::Nested m_xpr; + const Shuffle m_shuffle; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Self; + typedef TensorShufflingOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = true, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), m_impl(op.expression(), device) { + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const Shuffle& shuffle = op.shufflePermutation(); + m_is_identity = true; + for (int i = 0; i < NumDims; ++i) { + m_shuffle[i] = static_cast(shuffle[i]); + m_dimensions[i] = input_dims[shuffle[i]]; + m_inverseShuffle[shuffle[i]] = i; + if (m_is_identity && shuffle[i] != i) { + m_is_identity = false; + } + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_unshuffledInputStrides[0] = 1; + m_outputStrides[0] = 1; + + for (int i = 1; i < NumDims; ++i) { + m_unshuffledInputStrides[i] = m_unshuffledInputStrides[i - 1] * input_dims[i - 1]; + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = + internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1)); + } + } else { + m_unshuffledInputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_unshuffledInputStrides[i] = m_unshuffledInputStrides[i + 1] * input_dims[i + 1]; + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = + internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1)); + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } + } + + template + struct PacketLoader { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType Run(const Self& self, Index index) { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = self.coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + }; + + template + struct PacketLoader { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType Run(const Self& self, Index index) { + if (self.m_is_identity) { + return self.m_impl.template packet(index); + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = self.coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + }; + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + return PacketLoader::PacketAccess>::Run(*this, index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const int inner_dim = Layout == static_cast(ColMajor) ? 0 : NumDims - 1; + + const size_t target_size = m_device.firstLevelCacheSize(); + const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim; + + // Shuffled inner dimensions leads to a random memory access, which is not + // captured by default cost model bytes loaded/stored. We add this cost + // explicitly. The number of cycles picked based on the benchmarks. + // TODO(ezhulenev): This number was picked based on a very questionable + // benchmarks, add benchmarks that are representative of real workloads. + using BlockRequirements = internal::TensorBlockResourceRequirements; + if (inner_dim_shuffled) { + return BlockRequirements::uniform(target_size).addCostPerCoeff({0, 0, NumDims * 28}); + } else { + return BlockRequirements::skewed(target_size); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool root_of_expr_ast = false) const { + eigen_assert(m_impl.data() != NULL); + + typedef internal::TensorBlockIO TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch, /*allow_strided_storage=*/root_of_expr_ast); + + typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides); + TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset())); + + TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(), block_storage.data()); + + typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle); + TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = m_is_identity + ? TensorOpCost::AddCost() + : NumDims * (2 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index + GetBlockOutputIndex(Index input_index, const DSizes& input_block_strides, + const DSizes& output_block_strides, + const DSizes, NumDims>& fast_input_block_strides) const { + Index output_index = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = input_index / fast_input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * output_block_strides[m_inverseShuffle[0]]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = input_index / fast_input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * output_block_strides[m_inverseShuffle[NumDims - 1]]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[0]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[NumDims - 1]; + } + } + + Dimensions m_dimensions; + bool m_is_identity; + array m_shuffle; + array m_inverseShuffle; // TODO(ezhulenev): Make it int type. + array m_outputStrides; + array, NumDims> m_fastOutputStrides; + array m_inputStrides; + array m_unshuffledInputStrides; + + const Device EIGEN_DEVICE_REF m_device; + TensorEvaluator m_impl; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; + + typedef TensorShufflingOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + static constexpr int Layout = TensorEvaluator::Layout; + + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = true, + RawAccess = false + }; + + typedef std::remove_const_t ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + internal::pstore(values, x); + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index + i) = values[i]; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) { + eigen_assert(this->m_impl.data() != NULL); + + typedef internal::TensorBlockIO TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + const Scalar* block_buffer = block.data(); + + // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen + // expression with coefficient and packet access as `src`. + void* mem = NULL; + if (block_buffer == NULL) { + mem = this->m_device.allocate(desc.size() * sizeof(Scalar)); + ScalarNoConst* buf = static_cast(mem); + + typedef internal::TensorBlockAssignment + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target(desc.dimensions(), internal::strides(desc.dimensions()), buf), + block.expr()); + + block_buffer = buf; + } + + // Read from block. + TensorBlockIOSrc src(internal::strides(desc.dimensions()), block_buffer); + + // Write to the output buffer. + typename TensorBlockIO::Dimensions output_strides(this->m_unshuffledInputStrides); + typename TensorBlockIO::Dimensions output_dimensions; + for (int i = 0; i < NumDims; ++i) { + output_dimensions[this->m_shuffle[i]] = desc.dimension(i); + } + TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(), this->srcCoeff(desc.offset())); + + // Reorder dimensions according to the shuffle. + typename TensorBlockIO::DimensionsMap dst_to_src_dim_map; + for (int i = 0; i < NumDims; ++i) { + dst_to_src_dim_map[i] = static_cast(this->m_inverseShuffle[i]); + } + TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); + + // Deallocate temporary buffer used for the block materialization. + if (mem != NULL) this->m_device.deallocate(mem); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h new file mode 100644 index 00000000000..964797613ae --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -0,0 +1,144 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// Copyright (C) 2014-2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSORSTORAGE_H +#define EIGEN_CXX11_TENSOR_TENSORSTORAGE_H + +#ifdef EIGEN_TENSOR_STORAGE_CTOR_PLUGIN +#define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN EIGEN_TENSOR_STORAGE_CTOR_PLUGIN; +#else +#define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN +#endif + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \internal + * + * \class TensorStorage + * \ingroup CXX11_Tensor_Module + * + * \brief Stores the data of a tensor + * + * This class stores the data of fixed-size, dynamic-size or mixed tensors + * in a way as compact as possible. + * + * \sa Tensor + */ +template +class TensorStorage; + +// Pure fixed-size storage +template +class TensorStorage { + private: + static constexpr std::size_t Size = FixedDimensions::total_size; + + // Allocate an array of size at least one to prevent compiler warnings. + static constexpr std::size_t MinSize = max_n_1::size; + EIGEN_ALIGN_MAX T m_data[MinSize]; + + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStorage() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* data() { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T* data() const { return m_data; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const FixedDimensions dimensions() const { return FixedDimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex size() const { return Size; } +}; + +// pure dynamic +template +class TensorStorage, Options_> { + public: + typedef IndexType Index; + typedef DSizes Dimensions; + typedef TensorStorage, Options_> Self; + + EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() { + if (NumIndices_ == 0) { + m_data = internal::conditional_aligned_new_auto(1); + } + } + EIGEN_DEVICE_FUNC TensorStorage(Index size, const array& dimensions) + : m_data(internal::conditional_aligned_new_auto(size)), m_dimensions(dimensions) { + EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN + } + + template + EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) { + m_data = internal::conditional_aligned_new_auto(internal::array_prod(m_dimensions)); + } + + EIGEN_DEVICE_FUNC TensorStorage(const Self& other) + : m_data(internal::conditional_aligned_new_auto( + internal::array_prod(other.m_dimensions))), + m_dimensions(other.m_dimensions) { + internal::smart_copy(other.m_data, other.m_data + internal::array_prod(other.m_dimensions), m_data); + } + EIGEN_DEVICE_FUNC Self& operator=(const Self& other) { + if (this != &other) { + Self tmp(other); + this->swap(tmp); + } + return *this; + } + + EIGEN_DEVICE_FUNC TensorStorage(Self&& other) : TensorStorage() { *this = std::move(other); } + + EIGEN_DEVICE_FUNC Self& operator=(Self&& other) { + numext::swap(m_data, other.m_data); + numext::swap(m_dimensions, other.m_dimensions); + return *this; + } + + EIGEN_DEVICE_FUNC ~TensorStorage() { + internal::conditional_aligned_delete_auto(m_data, + internal::array_prod(m_dimensions)); + } + EIGEN_DEVICE_FUNC void swap(Self& other) { + numext::swap(m_data, other.m_data); + numext::swap(m_dimensions, other.m_dimensions); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC void resize(Index size, const array& nbDimensions) { + const Index currentSz = internal::array_prod(m_dimensions); + if (size != currentSz) { + internal::conditional_aligned_delete_auto(m_data, currentSz); + if (size) + m_data = internal::conditional_aligned_new_auto(size); + else if (NumIndices_ == 0) { + m_data = internal::conditional_aligned_new_auto(1); + } else + m_data = 0; + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) + } + m_dimensions = nbDimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* data() { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T* data() const { return m_data; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } + + private: + T* m_data; + Dimensions m_dimensions; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h new file mode 100644 index 00000000000..2924abba062 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -0,0 +1,316 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorStriding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor striding class. + * + * + */ +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorStridingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorStridingOp type; +}; + +} // end namespace internal + +template +class TensorStridingOp : public TensorBase > { + public: + typedef TensorBase > Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC const Strides& strides() const { return m_dims; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingOp) + + protected: + typename XprType::Nested m_xpr; + const Strides m_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorStridingOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = Eigen::numext::ceil(static_cast(m_dimensions[i]) / op.strides()[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + m_inputStrides[i - 1] *= op.strides()[i - 1]; + } + m_inputStrides[NumDims - 1] *= op.strides()[NumDims - 1]; + } else { // RowMajor + m_outputStrides[NumDims - 1] = 1; + m_inputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + m_inputStrides[i + 1] *= op.strides()[i + 1]; + } + m_inputStrides[0] *= op.strides()[0]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_impl.coeff(srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + PacketSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[0]; + inputIndices[1] += indices[1] * m_inputStrides[0]; + } else { // RowMajor + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[NumDims - 1]; + inputIndices[1] += indices[1] * m_inputStrides[NumDims - 1]; + } + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } else { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[PacketSize - 1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize - 1; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost() + TensorOpCost::MulCost() + + TensorOpCost::DivCost()) + + TensorOpCost::MulCost(); + if (vectorized) { + compute_cost *= 2; // packet() computes two indices + } + const int innerDim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : (NumDims - 1); + return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) + + // Computation is not vectorized per se, but it is done once per packet. + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[0]; + } else { // RowMajor + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[NumDims - 1]; + } + return inputIndex; + } + + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorStridingOp XprType; + typedef TensorEvaluator Base; + // typedef typename XprType::Index Index; + static constexpr int NumDims = internal::array_size::Dimensions>::value; + // typedef DSizes Dimensions; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = TensorEvaluator::PacketAccess, + PreferBlockAccess = false, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < this->dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + PacketSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[0]; + inputIndices[1] += indices[1] * this->m_inputStrides[0]; + } else { // RowMajor + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[NumDims - 1]; + inputIndices[1] += indices[1] * this->m_inputStrides[NumDims - 1]; + } + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { + this->m_impl.template writePacket(inputIndices[0], x); + } else { + EIGEN_ALIGN_MAX Scalar values[PacketSize]; + internal::pstore(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize - 1]; + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize - 1; ++i) { + this->coeffRef(index + i) = values[i]; + } + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h new file mode 100644 index 00000000000..035d94e540f --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -0,0 +1,278 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gagan Goel +// Copyright (C) 2017 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRACE_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRACE_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorTrace + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor Trace class. + * + * + */ + +namespace internal { +template +struct traits > : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions - array_size::value; + static constexpr int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorTraceOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorTraceOp type; +}; + +} // end namespace internal + +template +class TensorTraceOp : public TensorBase > { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTraceOp(const XprType& expr, const Dims& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dims& dims() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t& expression() const { + return m_xpr; + } + + protected: + typename XprType::Nested m_xpr; + const Dims m_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorTraceOp XprType; + static constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + static constexpr int NumReducedDims = internal::array_size::value; + static constexpr int NumOutputDims = NumInputDims - NumReducedDims; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = internal::unpacket_traits::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_traceDim(1), m_device(device) { + EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumReducedDims >= 2) || ((NumReducedDims == 0) && (NumInputDims == 0)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + for (int i = 0; i < NumInputDims; ++i) { + m_reduced[i] = false; + } + + const Dims& op_dims = op.dims(); + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op_dims[i] >= 0); + eigen_assert(op_dims[i] < NumInputDims); + m_reduced[op_dims[i]] = true; + } + + // All the dimensions should be distinct to compute the trace + int num_distinct_reduce_dims = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + ++num_distinct_reduce_dims; + } + } + + EIGEN_ONLY_USED_FOR_DEBUG(num_distinct_reduce_dims); + eigen_assert(num_distinct_reduce_dims == NumReducedDims); + + // Compute the dimensions of the result. + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + int output_index = 0; + int reduced_index = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + m_reducedDims[reduced_index] = input_dims[i]; + if (reduced_index > 0) { + // All the trace dimensions must have the same size + eigen_assert(m_reducedDims[0] == m_reducedDims[reduced_index]); + } + ++reduced_index; + } else { + m_dimensions[output_index] = input_dims[i]; + ++output_index; + } + } + + if (NumReducedDims != 0) { + m_traceDim = m_reducedDims[0]; + } + + // Compute the output strides + if (NumOutputDims > 0) { + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_outputStrides.back() = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + } + + // Compute the input strides + if (NumInputDims > 0) { + array input_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i - 1] * input_dims[i - 1]; + } + } else { + input_strides.back() = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + } + + output_index = 0; + reduced_index = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + m_reducedStrides[reduced_index] = input_strides[i]; + ++reduced_index; + } else { + m_preservedStrides[output_index] = input_strides[i]; + ++output_index; + } + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + // Initialize the result + CoeffReturnType result = internal::cast(0); + Index index_stride = 0; + for (int i = 0; i < NumReducedDims; ++i) { + index_stride += m_reducedStrides[i]; + } + + // If trace is requested along all dimensions, starting index would be 0 + Index cur_index = 0; + if (NumOutputDims != 0) cur_index = firstInput(index); + for (Index i = 0; i < m_traceDim; ++i) { + result += m_impl.coeff(cur_index); + cur_index += index_stride; + } + + return result; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType result = internal::ploadt(values); + return result; + } + + protected: + // Given the output index, finds the first index in the input tensor used to compute the trace + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[0]; + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[NumOutputDims - 1]; + } + return startInput; + } + + Dimensions m_dimensions; + TensorEvaluator m_impl; + // Initialize the size of the trace dimension + Index m_traceDim; + const Device EIGEN_DEVICE_REF m_device; + array m_reduced; + array m_reducedDims; + array m_outputStrides; + array m_reducedStrides; + array m_preservedStrides; +}; + +} // End namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRACE_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h new file mode 100644 index 00000000000..017b4ff6340 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -0,0 +1,231 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +template +class compute_tensor_flags { + enum { + is_dynamic_size_storage = 1, + + is_aligned = (((Options & DontAlign) == 0) && ( +#if EIGEN_MAX_STATIC_ALIGN_BYTES > 0 + (!is_dynamic_size_storage) +#else + 0 +#endif + | +#if EIGEN_MAX_ALIGN_BYTES > 0 + is_dynamic_size_storage +#else + 0 +#endif + )), + packet_access_bit = packet_traits::Vectorizable && is_aligned ? PacketAccessBit : 0 + }; + + public: + enum { ret = packet_access_bit }; +}; + +template +struct traits > { + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef IndexType_ Index; + static constexpr int NumDimensions = NumIndices_; + static constexpr int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags::ret | (is_const::value ? 0 : LvalueBit) + }; + template + struct MakePointer { + typedef T* Type; + }; + typedef typename MakePointer::Type PointerType; +}; + +template +struct traits > { + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef IndexType_ Index; + static constexpr int NumDimensions = array_size::value; + static constexpr int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags::ret | (is_const::value ? 0 : LvalueBit) + }; + template + struct MakePointer { + typedef T* Type; + }; + typedef typename MakePointer::Type PointerType; +}; + +template class MakePointer_> +struct traits > : public traits { + typedef traits BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + static constexpr int NumDimensions = BaseTraits::NumDimensions; + static constexpr int Layout = BaseTraits::Layout; + enum { Options = Options_, Flags = BaseTraits::Flags }; + template + struct MakePointer { + // Intermediate typedef to workaround MSVC issue. + typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; + typedef typename MakePointer::Type PointerType; +}; + +template +struct traits > : public traits { + typedef traits BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + static constexpr int NumDimensions = BaseTraits::NumDimensions; + static constexpr int Layout = BaseTraits::Layout; + enum { Options = BaseTraits::Options, Flags = BaseTraits::Flags }; + typedef typename BaseTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const Tensor EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> { + typedef const Tensor EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorFixedSize EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorFixedSize EIGEN_DEVICE_REF type; +}; + +template class MakePointer> +struct eval, Eigen::Dense> { + typedef const TensorMap EIGEN_DEVICE_REF type; +}; + +template class MakePointer> +struct eval, Eigen::Dense> { + typedef const TensorMap EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorRef EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorRef EIGEN_DEVICE_REF type; +}; + +// TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector. +template +struct nested { + typedef typename ref_selector::type type; +}; + +template +struct nested > { + typedef const Tensor EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef const Tensor EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef const TensorFixedSize EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef const TensorFixedSize EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef const TensorRef EIGEN_DEVICE_REF type; +}; + +template +struct nested > { + typedef const TensorRef EIGEN_DEVICE_REF type; +}; + +} // end namespace internal + +// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C, +// R, B), and convolve it with a set of filters, which can also be presented as +// a tensor (D, K, K, M), where M is the number of filters, K is the filter +// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For +// simplicity we assume that we always use square filters (which is usually the +// case in images), hence the two Ks in the tensor dimension. It also takes in +// a few additional parameters: +// Stride (S): The convolution stride is the offset between locations where we +// apply the filters. A larger stride means that the output will be +// spatially smaller. +// Padding (P): The padding we apply to the input tensor along the R and C +// dimensions. This is usually used to make sure that the spatial +// dimensions of the output matches our intention. +// +// Two types of padding are often used: +// SAME: The pad value is computed so that the output will have size +// R/S and C/S. +// VALID: no padding is carried out. +// When we do padding, the padded values at the padded locations are usually +// zero. +// +// The output dimensions for convolution, when given all the parameters above, +// are as follows: +// When Padding = SAME: the output size is (B, R', C', M), where +// R' = ceil(float(R) / float(S)) +// C' = ceil(float(C) / float(S)) +// where ceil is the ceiling function. The input tensor is padded with 0 as +// needed. The number of padded rows and columns are computed as: +// Pr = ((R' - 1) * S + K - R) / 2 +// Pc = ((C' - 1) * S + K - C) / 2 +// when the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2. +// This is where SAME comes from - the output has the same size as the input has. +// When Padding = VALID: the output size is computed as +// R' = ceil(float(R - K + 1) / float(S)) +// C' = ceil(float(C - K + 1) / float(S)) +// and the number of padded rows and columns are computed in the same way as in +// the SAME case. +// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0, +// Pc=0. +enum PaddingType { PADDING_VALID = 1, PADDING_SAME = 2 }; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h new file mode 100644 index 00000000000..99e51c57718 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -0,0 +1,229 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_UINT128_H +#define EIGEN_CXX11_TENSOR_TENSOR_UINT128_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +template +struct static_val { + static const uint64_t value = n; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() {} + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) { + EIGEN_UNUSED_VARIABLE(v); + eigen_assert(v == n); + } +}; + +template +struct TensorUInt128 { + HIGH high; + LOW low; + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(const TensorUInt128& other) + : high(other.high), low(other.low) { + EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128& operator=(const TensorUInt128& other) { + EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE); + high = other.high; + low = other.low; + return *this; + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE explicit TensorUInt128(const T& x) : high(0), low(x) { + eigen_assert( + (static_cast>(x) <= NumTraits::highest())); + eigen_assert(x >= 0); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(HIGH y, LOW x) : high(y), low(x) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const { return low; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LOW lower() const { return low; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HIGH upper() const { return high; } +}; + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator==(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + return (lhs.high == rhs.high) && (lhs.low == rhs.low); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator!=(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + return (lhs.high != rhs.high) || (lhs.low != rhs.low); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator>=(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + if (lhs.high != rhs.high) { + return lhs.high > rhs.high; + } + return lhs.low >= rhs.low; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator<(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + if (lhs.high != rhs.high) { + return lhs.high < rhs.high; + } + return lhs.low < rhs.low; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128 operator+(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + TensorUInt128 result(lhs.high + rhs.high, lhs.low + rhs.low); + if (result.low < rhs.low) { + result.high += 1; + } + return result; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128 operator-(const TensorUInt128& lhs, + const TensorUInt128& rhs) { + TensorUInt128 result(lhs.high - rhs.high, lhs.low - rhs.low); + if (result.low > lhs.low) { + result.high -= 1; + } + return result; +} + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorUInt128 operator*( + const TensorUInt128& lhs, const TensorUInt128& rhs) { + // Split each 128-bit integer into 4 32-bit integers, and then do the + // multiplications by hand as follow: + // lhs a b c d + // rhs e f g h + // ----------- + // ah bh ch dh + // bg cg dg + // cf df + // de + // The result is stored in 2 64bit integers, high and low. + + const uint64_t LOW = 0x00000000FFFFFFFFLL; + const uint64_t HIGH = 0xFFFFFFFF00000000LL; + + uint64_t d = lhs.low & LOW; + uint64_t c = (lhs.low & HIGH) >> 32LL; + uint64_t b = lhs.high & LOW; + uint64_t a = (lhs.high & HIGH) >> 32LL; + + uint64_t h = rhs.low & LOW; + uint64_t g = (rhs.low & HIGH) >> 32LL; + uint64_t f = rhs.high & LOW; + uint64_t e = (rhs.high & HIGH) >> 32LL; + + // Compute the low 32 bits of low + uint64_t acc = d * h; + uint64_t low = acc & LOW; + // Compute the high 32 bits of low. Add a carry every time we wrap around + acc >>= 32LL; + uint64_t carry = 0; + uint64_t acc2 = acc + c * h; + if (acc2 < acc) { + carry++; + } + acc = acc2 + d * g; + if (acc < acc2) { + carry++; + } + low |= (acc << 32LL); + + // Carry forward the high bits of acc to initiate the computation of the + // low 32 bits of high + acc2 = (acc >> 32LL) | (carry << 32LL); + carry = 0; + + acc = acc2 + b * h; + if (acc < acc2) { + carry++; + } + acc2 = acc + c * g; + if (acc2 < acc) { + carry++; + } + acc = acc2 + d * f; + if (acc < acc2) { + carry++; + } + uint64_t high = acc & LOW; + + // Start to compute the high 32 bits of high. + acc2 = (acc >> 32LL) | (carry << 32LL); + + acc = acc2 + a * h; + acc2 = acc + b * g; + acc = acc2 + c * f; + acc2 = acc + d * e; + high |= (acc2 << 32LL); + + return TensorUInt128(high, low); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorUInt128 operator/( + const TensorUInt128& lhs, const TensorUInt128& rhs) { + if (rhs == TensorUInt128, static_val<1>>(1)) { + return TensorUInt128(lhs.high, lhs.low); + } else if (lhs < rhs) { + return TensorUInt128(0); + } else { + // calculate the biggest power of 2 times rhs that's less than or equal to lhs + TensorUInt128 power2(1); + TensorUInt128 d(rhs); + TensorUInt128 tmp(lhs - d); + while (lhs >= d) { + tmp = tmp - d; + d = d + d; + power2 = power2 + power2; + } + + tmp = TensorUInt128(lhs.high, lhs.low); + TensorUInt128 result(0); + while (power2 != TensorUInt128, static_val<0>>(0)) { + if (tmp >= d) { + tmp = tmp - d; + result = result + power2; + } + // Shift right + power2 = TensorUInt128(power2.high >> 1, (power2.low >> 1) | (power2.high << 63)); + d = TensorUInt128(d.high >> 1, (d.low >> 1) | (d.high << 63)); + } + + return result; + } +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_UINT128_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h new file mode 100644 index 00000000000..2f0c1354167 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -0,0 +1,619 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +/** \class TensorVolumePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for processing of volumetric data. + * This assumes that the input has a least 4 dimensions ordered as follows: + * - channels + * - planes + * - rows + * - columns + * - (optional) additional dimensions such as time or batch size. + * Calling the volume patch code with patch_planes, patch_rows, and patch_cols + * is equivalent to calling the regular patch extraction code with parameters + * d, patch_planes, patch_rows, patch_cols, and 1 for all the additional + * dimensions. + */ +namespace internal { + +template +struct traits > : public traits { + typedef std::remove_const_t Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef std::remove_reference_t Nested_; + static constexpr int NumDimensions = XprTraits::NumDimensions + 1; + static constexpr int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorVolumePatchOp& type; +}; + +template +struct nested, 1, + typename eval >::type> { + typedef TensorVolumePatchOp type; +}; + +} // end namespace internal + +template +class TensorVolumePatchOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp( + const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, DenseIndex in_plane_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, DenseIndex plane_inflate_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, PaddingType padding_type, Scalar padding_value) + : m_xpr(expr), + m_patch_planes(patch_planes), + m_patch_rows(patch_rows), + m_patch_cols(patch_cols), + m_plane_strides(plane_strides), + m_row_strides(row_strides), + m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), + m_in_row_strides(in_row_strides), + m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), + m_row_inflate_strides(row_inflate_strides), + m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), + m_padding_top_z(0), + m_padding_bottom_z(0), + m_padding_top(0), + m_padding_bottom(0), + m_padding_left(0), + m_padding_right(0), + m_padding_type(padding_type), + m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp( + const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, DenseIndex in_plane_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, DenseIndex plane_inflate_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, DenseIndex padding_top_z, + DenseIndex padding_bottom_z, DenseIndex padding_top, DenseIndex padding_bottom, DenseIndex padding_left, + DenseIndex padding_right, Scalar padding_value) + : m_xpr(expr), + m_patch_planes(patch_planes), + m_patch_rows(patch_rows), + m_patch_cols(patch_cols), + m_plane_strides(plane_strides), + m_row_strides(row_strides), + m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), + m_in_row_strides(in_row_strides), + m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), + m_row_inflate_strides(row_inflate_strides), + m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), + m_padding_top_z(padding_top_z), + m_padding_bottom_z(padding_bottom_z), + m_padding_top(padding_top), + m_padding_bottom(padding_bottom), + m_padding_left(padding_left), + m_padding_right(padding_right), + m_padding_type(PADDING_VALID), + m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC DenseIndex patch_planes() const { return m_patch_planes; } + EIGEN_DEVICE_FUNC DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC DenseIndex plane_strides() const { return m_plane_strides; } + EIGEN_DEVICE_FUNC DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC DenseIndex in_plane_strides() const { return m_in_plane_strides; } + EIGEN_DEVICE_FUNC DenseIndex in_row_strides() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC DenseIndex in_col_strides() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC DenseIndex plane_inflate_strides() const { return m_plane_inflate_strides; } + EIGEN_DEVICE_FUNC DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC bool padding_explicit() const { return m_padding_explicit; } + EIGEN_DEVICE_FUNC DenseIndex padding_top_z() const { return m_padding_top_z; } + EIGEN_DEVICE_FUNC DenseIndex padding_bottom_z() const { return m_padding_bottom_z; } + EIGEN_DEVICE_FUNC DenseIndex padding_top() const { return m_padding_top; } + EIGEN_DEVICE_FUNC DenseIndex padding_bottom() const { return m_padding_bottom; } + EIGEN_DEVICE_FUNC DenseIndex padding_left() const { return m_padding_left; } + EIGEN_DEVICE_FUNC DenseIndex padding_right() const { return m_padding_right; } + EIGEN_DEVICE_FUNC PaddingType padding_type() const { return m_padding_type; } + EIGEN_DEVICE_FUNC Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC const internal::remove_all_t& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_planes; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_plane_strides; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const DenseIndex m_in_plane_strides; + const DenseIndex m_in_row_strides; + const DenseIndex m_in_col_strides; + const DenseIndex m_plane_inflate_strides; + const DenseIndex m_row_inflate_strides; + const DenseIndex m_col_inflate_strides; + const bool m_padding_explicit; + const DenseIndex m_padding_top_z; + const DenseIndex m_padding_bottom_z; + const DenseIndex m_padding_top; + const DenseIndex m_padding_bottom; + const DenseIndex m_padding_left; + const DenseIndex m_padding_right; + const PaddingType m_padding_type; + const Scalar m_padding_value; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorVolumePatchOp XprType; + typedef typename XprType::Index Index; + static constexpr int NumInputDims = + internal::array_size::Dimensions>::value; + static constexpr int NumDims = NumInputDims + 1; + typedef DSizes Dimensions; + typedef std::remove_const_t Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static constexpr int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static constexpr int Layout = TensorEvaluator::Layout; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { + EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); + + m_paddingValue = op.padding_value(); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + // Cache a few variables. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputDepth = input_dims[0]; + m_inputPlanes = input_dims[1]; + m_inputRows = input_dims[2]; + m_inputCols = input_dims[3]; + } else { + m_inputDepth = input_dims[NumInputDims - 1]; + m_inputPlanes = input_dims[NumInputDims - 2]; + m_inputRows = input_dims[NumInputDims - 3]; + m_inputCols = input_dims[NumInputDims - 4]; + } + + m_plane_strides = op.plane_strides(); + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // Input strides and effective input/patch size + m_in_plane_strides = op.in_plane_strides(); + m_in_row_strides = op.in_row_strides(); + m_in_col_strides = op.in_col_strides(); + m_plane_inflate_strides = op.plane_inflate_strides(); + m_row_inflate_strides = op.row_inflate_strides(); + m_col_inflate_strides = op.col_inflate_strides(); + + // The "effective" spatial size after inflating data with zeros. + m_input_planes_eff = (m_inputPlanes - 1) * m_plane_inflate_strides + 1; + m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; + m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; + m_patch_planes_eff = op.patch_planes() + (op.patch_planes() - 1) * (m_in_plane_strides - 1); + m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); + m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); + + if (op.padding_explicit()) { + m_outputPlanes = + numext::ceil((m_input_planes_eff + op.padding_top_z() + op.padding_bottom_z() - m_patch_planes_eff + 1.f) / + static_cast(m_plane_strides)); + m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / + static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / + static_cast(m_col_strides)); + m_planePaddingTop = op.padding_top_z(); + m_rowPaddingTop = op.padding_top(); + m_colPaddingLeft = op.padding_left(); + } else { + // Computing padding from the type + switch (op.padding_type()) { + case PADDING_VALID: + m_outputPlanes = + numext::ceil((m_input_planes_eff - m_patch_planes_eff + 1.f) / static_cast(m_plane_strides)); + m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); + m_planePaddingTop = 0; + m_rowPaddingTop = 0; + m_colPaddingLeft = 0; + break; + case PADDING_SAME: { + m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast(m_plane_strides)); + m_outputRows = numext::ceil(m_input_rows_eff / static_cast(m_row_strides)); + m_outputCols = numext::ceil(m_input_cols_eff / static_cast(m_col_strides)); + const Index dz = (m_outputPlanes - 1) * m_plane_strides + m_patch_planes_eff - m_input_planes_eff; + const Index dy = (m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff; + const Index dx = (m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff; + m_planePaddingTop = dz / 2; + m_rowPaddingTop = dy / 2; + m_colPaddingLeft = dx / 2; + break; + } + default: { + eigen_assert(false && "unexpected padding"); + return; + } + } + } + eigen_assert(m_outputRows > 0); + eigen_assert(m_outputCols > 0); + eigen_assert(m_outputPlanes > 0); + + // Dimensions for result of extraction. + if (static_cast(Layout) == static_cast(ColMajor)) { + // ColMajor + // 0: depth + // 1: patch_planes + // 2: patch_rows + // 3: patch_cols + // 4: number of patches + // 5 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_planes(); + m_dimensions[2] = op.patch_rows(); + m_dimensions[3] = op.patch_cols(); + m_dimensions[4] = m_outputPlanes * m_outputRows * m_outputCols; + for (int i = 5; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i - 1]; + } + } else { + // RowMajor + // NumDims-1: depth + // NumDims-2: patch_planes + // NumDims-3: patch_rows + // NumDims-4: patch_cols + // NumDims-5: number of patches + // NumDims-6 and beyond: anything else (such as batch). + m_dimensions[NumDims - 1] = input_dims[NumInputDims - 1]; + m_dimensions[NumDims - 2] = op.patch_planes(); + m_dimensions[NumDims - 3] = op.patch_rows(); + m_dimensions[NumDims - 4] = op.patch_cols(); + m_dimensions[NumDims - 5] = m_outputPlanes * m_outputRows * m_outputCols; + for (int i = NumDims - 6; i >= 0; --i) { + m_dimensions[i] = input_dims[i]; + } + } + + // Strides for the output tensor. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_rowStride = m_dimensions[1]; + m_colStride = m_dimensions[2] * m_rowStride; + m_patchStride = m_colStride * m_dimensions[3] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[4]; + } else { + m_rowStride = m_dimensions[NumDims - 2]; + m_colStride = m_dimensions[NumDims - 3] * m_rowStride; + m_patchStride = m_colStride * m_dimensions[NumDims - 4] * m_dimensions[NumDims - 1]; + m_otherStride = m_patchStride * m_dimensions[NumDims - 5]; + } + + // Strides for navigating through the input tensor. + m_planeInputStride = m_inputDepth; + m_rowInputStride = m_inputDepth * m_inputPlanes; + m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes; + m_otherInputStride = m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes; + + m_outputPlanesRows = m_outputPlanes * m_outputRows; + + // Fast representations of different variables. + m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); + + m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); + m_fastColStride = internal::TensorIntDivisor(m_colStride); + m_fastRowStride = internal::TensorIntDivisor(m_rowStride); + m_fastInputRowStride = internal::TensorIntDivisor(m_row_inflate_strides); + m_fastInputColStride = internal::TensorIntDivisor(m_col_inflate_strides); + m_fastInputPlaneStride = internal::TensorIntDivisor(m_plane_inflate_strides); + m_fastInputColsEff = internal::TensorIntDivisor(m_input_cols_eff); + m_fastOutputPlanes = internal::TensorIntDivisor(m_outputPlanes); + m_fastOutputPlanesRows = internal::TensorIntDivisor(m_outputPlanesRows); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[0]); + } else { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims - 1]); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType /*data*/, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + + // Spatial offset within the patch. This has to be translated into 3D + // coordinates within the patch. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; + + // Batch, etc. + const Index otherIndex = (NumDims == 5) ? 0 : index / m_fastOtherStride; + const Index patch3DIndex = (NumDims == 5) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + // Calculate column index in the input original tensor. + const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = + (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate row index in the original input tensor. + const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + const Index rowOffset = (patchOffset - colOffset * m_colStride) / m_fastRowStride; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = + (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate plane index in the original input tensor. + const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex)); + const Index planeOffset = patchOffset - colOffset * m_colStride - rowOffset * m_rowStride; + const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop; + const Index origInputPlane = + (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0); + if (inputPlane < 0 || inputPlane >= m_input_planes_eff || + ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + + const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + + origInputPlane * m_planeInputStride + otherIndex * m_otherInputStride; + + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 || + m_in_plane_strides != 1 || m_plane_inflate_strides != 1) { + return packetWithPossibleZero(index); + } + + const Index indices[2] = {index, index + PacketSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 5) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, + (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; + + const Index patch3DIndex = + (NumDims == 5) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch3DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + // Calculate col indices in the original input tensor. + const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft, + colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputCols[0] != inputCols[1]) { + return packetWithPossibleZero(index); + } + + const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + const Index rowOffsets[2] = {(patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride, + (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + // Calculate col indices in the original input tensor. + const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop, + rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputRows[0] != inputRows[1]) { + return packetWithPossibleZero(index); + } + + const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex)); + const Index planeOffsets[2] = {patchOffsets[0] - colOffsets[0] * m_colStride - rowOffsets[0] * m_rowStride, + patchOffsets[1] - colOffsets[1] * m_colStride - rowOffsets[1] * m_rowStride}; + eigen_assert(planeOffsets[0] <= planeOffsets[1]); + const Index inputPlanes[2] = {planeIndex * m_plane_strides + planeOffsets[0] - m_planePaddingTop, + planeIndex * m_plane_strides + planeOffsets[1] - m_planePaddingTop}; + + if (inputPlanes[1] < 0 || inputPlanes[0] >= m_inputPlanes) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) { + // no padding + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + + m_planeInputStride * inputPlanes[0] + otherIndex * m_otherInputStride; + return m_impl.template packet(inputIndex); + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = + 10 * TensorOpCost::DivCost() + 21 * TensorOpCost::MulCost() + 8 * TensorOpCost::AddCost(); + return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + const TensorEvaluator& impl() const { return m_impl; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planePaddingTop() const { return m_planePaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputPlanes() const { return m_outputPlanes; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userPlaneStride() const { return m_plane_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInPlaneStride() const { return m_in_plane_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planeInflateStride() const { return m_plane_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { + EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + + // Parameters passed to the constructor. + Index m_plane_strides; + Index m_row_strides; + Index m_col_strides; + + Index m_outputPlanes; + Index m_outputRows; + Index m_outputCols; + + Index m_planePaddingTop; + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + Index m_in_plane_strides; + Index m_in_row_strides; + Index m_in_col_strides; + + Index m_plane_inflate_strides; + Index m_row_inflate_strides; + Index m_col_inflate_strides; + + // Cached input size. + Index m_inputDepth; + Index m_inputPlanes; + Index m_inputRows; + Index m_inputCols; + + // Other cached variables. + Index m_outputPlanesRows; + + // Effective input/patch post-inflation size. + Index m_input_planes_eff; + Index m_input_rows_eff; + Index m_input_cols_eff; + Index m_patch_planes_eff; + Index m_patch_rows_eff; + Index m_patch_cols_eff; + + // Strides for the output tensor. + Index m_otherStride; + Index m_patchStride; + Index m_rowStride; + Index m_colStride; + + // Strides for the input tensor. + Index m_planeInputStride; + Index m_rowInputStride; + Index m_colInputStride; + Index m_otherInputStride; + + internal::TensorIntDivisor m_fastOtherStride; + internal::TensorIntDivisor m_fastPatchStride; + internal::TensorIntDivisor m_fastColStride; + internal::TensorIntDivisor m_fastRowStride; + internal::TensorIntDivisor m_fastInputPlaneStride; + internal::TensorIntDivisor m_fastInputRowStride; + internal::TensorIntDivisor m_fastInputColStride; + internal::TensorIntDivisor m_fastInputColsEff; + internal::TensorIntDivisor m_fastOutputPlanesRows; + internal::TensorIntDivisor m_fastOutputPlanes; + internal::TensorIntDivisor m_fastOutputDepth; + + Scalar m_paddingValue; + + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h new file mode 100644 index 00000000000..51c0ad6658a --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h @@ -0,0 +1,296 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +class DynamicSGroup { + public: + inline explicit DynamicSGroup() : m_numIndices(1), m_elements(), m_generators(), m_globalFlags(0) { + m_elements.push_back(ge(Generator(0, 0, 0))); + } + inline DynamicSGroup(const DynamicSGroup& o) + : m_numIndices(o.m_numIndices), + m_elements(o.m_elements), + m_generators(o.m_generators), + m_globalFlags(o.m_globalFlags) {} + inline DynamicSGroup(DynamicSGroup&& o) + : m_numIndices(o.m_numIndices), m_elements(), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { + std::swap(m_elements, o.m_elements); + } + inline DynamicSGroup& operator=(const DynamicSGroup& o) { + m_numIndices = o.m_numIndices; + m_elements = o.m_elements; + m_generators = o.m_generators; + m_globalFlags = o.m_globalFlags; + return *this; + } + inline DynamicSGroup& operator=(DynamicSGroup&& o) { + m_numIndices = o.m_numIndices; + std::swap(m_elements, o.m_elements); + m_generators = o.m_generators; + m_globalFlags = o.m_globalFlags; + return *this; + } + + void add(int one, int two, int flags = 0); + + template + inline void add(Gen_) { + add(Gen_::One, Gen_::Two, Gen_::Flags); + } + inline void addSymmetry(int one, int two) { add(one, two, 0); } + inline void addAntiSymmetry(int one, int two) { add(one, two, NegationFlag); } + inline void addHermiticity(int one, int two) { add(one, two, ConjugationFlag); } + inline void addAntiHermiticity(int one, int two) { add(one, two, NegationFlag | ConjugationFlag); } + + template + inline RV apply(const std::array& idx, RV initial, Args&&... args) const { + eigen_assert(N >= m_numIndices && + "Can only apply symmetry group to objects that have at least the required amount of indices."); + for (std::size_t i = 0; i < size(); i++) + initial = Op::run(h_permute(i, idx, typename internal::gen_numeric_list::type()), m_elements[i].flags, + initial, std::forward(args)...); + return initial; + } + + template + inline RV apply(const std::vector& idx, RV initial, Args&&... args) const { + eigen_assert(idx.size() >= m_numIndices && + "Can only apply symmetry group to objects that have at least the required amount of indices."); + for (std::size_t i = 0; i < size(); i++) + initial = Op::run(h_permute(i, idx), m_elements[i].flags, initial, std::forward(args)...); + return initial; + } + + inline int globalFlags() const { return m_globalFlags; } + inline std::size_t size() const { return m_elements.size(); } + + template + inline internal::tensor_symmetry_value_setter operator()(Tensor_& tensor, + typename Tensor_::Index firstIndex, + IndexTypes... otherIndices) const { + static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, + "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + } + + template + inline internal::tensor_symmetry_value_setter operator()( + Tensor_& tensor, std::array const& indices) const { + return internal::tensor_symmetry_value_setter(tensor, *this, indices); + } + + private: + struct GroupElement { + std::vector representation; + int flags; + bool isId() const { + for (std::size_t i = 0; i < representation.size(); i++) + if (i != (size_t)representation[i]) return false; + return true; + } + }; + struct Generator { + int one; + int two; + int flags; + constexpr inline Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {} + }; + + std::size_t m_numIndices; + std::vector m_elements; + std::vector m_generators; + int m_globalFlags; + + template + inline std::array h_permute(std::size_t which, const std::array& idx, + internal::numeric_list) const { + return std::array{{idx[n >= m_numIndices ? n : m_elements[which].representation[n]]...}}; + } + + template + inline std::vector h_permute(std::size_t which, std::vector idx) const { + std::vector result; + result.reserve(idx.size()); + for (auto k : m_elements[which].representation) result.push_back(idx[k]); + for (std::size_t i = m_numIndices; i < idx.size(); i++) result.push_back(idx[i]); + return result; + } + + inline GroupElement ge(Generator const& g) const { + GroupElement result; + result.representation.reserve(m_numIndices); + result.flags = g.flags; + for (std::size_t k = 0; k < m_numIndices; k++) { + if (k == (std::size_t)g.one) + result.representation.push_back(g.two); + else if (k == (std::size_t)g.two) + result.representation.push_back(g.one); + else + result.representation.push_back(int(k)); + } + return result; + } + + GroupElement mul(GroupElement, GroupElement) const; + inline GroupElement mul(Generator g1, GroupElement g2) const { return mul(ge(g1), g2); } + + inline GroupElement mul(GroupElement g1, Generator g2) const { return mul(g1, ge(g2)); } + + inline GroupElement mul(Generator g1, Generator g2) const { return mul(ge(g1), ge(g2)); } + + inline int findElement(GroupElement e) const { + for (auto ee : m_elements) { + if (ee.representation == e.representation) return ee.flags ^ e.flags; + } + return -1; + } + + void updateGlobalFlags(int flagDiffOfSameGenerator); +}; + +// dynamic symmetry group that auto-adds the template parameters in the constructor +template +class DynamicSGroupFromTemplateArgs : public DynamicSGroup { + public: + inline DynamicSGroupFromTemplateArgs() : DynamicSGroup() { add_all(internal::type_list()); } + inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs const& other) : DynamicSGroup(other) {} + inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs&& other) : DynamicSGroup(other) {} + inline DynamicSGroupFromTemplateArgs& operator=(const DynamicSGroupFromTemplateArgs& o) { + DynamicSGroup::operator=(o); + return *this; + } + inline DynamicSGroupFromTemplateArgs& operator=(DynamicSGroupFromTemplateArgs&& o) { + DynamicSGroup::operator=(o); + return *this; + } + + private: + template + inline void add_all(internal::type_list) { + add(Gen1()); + add_all(internal::type_list()); + } + + inline void add_all(internal::type_list<>) {} +}; + +inline DynamicSGroup::GroupElement DynamicSGroup::mul(GroupElement g1, GroupElement g2) const { + eigen_internal_assert(g1.representation.size() == m_numIndices); + eigen_internal_assert(g2.representation.size() == m_numIndices); + + GroupElement result; + result.representation.reserve(m_numIndices); + for (std::size_t i = 0; i < m_numIndices; i++) { + int v = g2.representation[g1.representation[i]]; + eigen_assert(v >= 0); + result.representation.push_back(v); + } + result.flags = g1.flags ^ g2.flags; + return result; +} + +inline void DynamicSGroup::add(int one, int two, int flags) { + eigen_assert(one >= 0); + eigen_assert(two >= 0); + eigen_assert(one != two); + + if ((std::size_t)one >= m_numIndices || (std::size_t)two >= m_numIndices) { + std::size_t newNumIndices = (one > two) ? one : two + 1; + for (auto& gelem : m_elements) { + gelem.representation.reserve(newNumIndices); + for (std::size_t i = m_numIndices; i < newNumIndices; i++) gelem.representation.push_back(i); + } + m_numIndices = newNumIndices; + } + + Generator g{one, two, flags}; + GroupElement e = ge(g); + + /* special case for first generator */ + if (m_elements.size() == 1) { + while (!e.isId()) { + m_elements.push_back(e); + e = mul(e, g); + } + + if (e.flags > 0) updateGlobalFlags(e.flags); + + // only add in case we didn't have identity + if (m_elements.size() > 1) m_generators.push_back(g); + return; + } + + int p = findElement(e); + if (p >= 0) { + updateGlobalFlags(p); + return; + } + + std::size_t coset_order = m_elements.size(); + m_elements.push_back(e); + for (std::size_t i = 1; i < coset_order; i++) m_elements.push_back(mul(m_elements[i], e)); + m_generators.push_back(g); + + std::size_t coset_rep = coset_order; + do { + for (auto g : m_generators) { + e = mul(m_elements[coset_rep], g); + p = findElement(e); + if (p < 0) { + // element not yet in group + m_elements.push_back(e); + for (std::size_t i = 1; i < coset_order; i++) m_elements.push_back(mul(m_elements[i], e)); + } else if (p > 0) { + updateGlobalFlags(p); + } + } + coset_rep += coset_order; + } while (coset_rep < m_elements.size()); +} + +inline void DynamicSGroup::updateGlobalFlags(int flagDiffOfSameGenerator) { + switch (flagDiffOfSameGenerator) { + case 0: + default: + // nothing happened + break; + case NegationFlag: + // every element is it's own negative => whole tensor is zero + m_globalFlags |= GlobalZeroFlag; + break; + case ConjugationFlag: + // every element is it's own conjugate => whole tensor is real + m_globalFlags |= GlobalRealFlag; + break; + case (NegationFlag | ConjugationFlag): + // every element is it's own negative conjugate => whole tensor is imaginary + m_globalFlags |= GlobalImagFlag; + break; + /* NOTE: + * since GlobalZeroFlag == GlobalRealFlag | GlobalImagFlag, if one generator + * causes the tensor to be real and the next one to be imaginary, this will + * trivially give the correct result + */ + } +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/InternalHeaderCheck.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/InternalHeaderCheck.h new file mode 100644 index 00000000000..b1b2e14c6e6 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/InternalHeaderCheck.h @@ -0,0 +1,4 @@ +#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE_H +#error \ + "Please include unsupported/Eigen/CXX11/TensorSymmetry instead of including headers inside the src directory directly." +#endif diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h new file mode 100644 index 00000000000..3f9bb51e716 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h @@ -0,0 +1,223 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +template +struct tensor_static_symgroup_permutate; + +template +struct tensor_static_symgroup_permutate> { + constexpr static std::size_t N = sizeof...(nn); + + template + constexpr static inline std::array run(const std::array& indices) { + return {{indices[nn]...}}; + } +}; + +template +struct tensor_static_symgroup_element { + typedef indices_ indices; + constexpr static int flags = flags_; +}; + +template +struct tensor_static_symgroup_element_ctor { + typedef tensor_static_symgroup_element::type, + Gen::Flags> + type; +}; + +template +struct tensor_static_symgroup_identity_ctor { + typedef tensor_static_symgroup_element::type, 0> type; +}; + +template +struct tensor_static_symgroup_multiply_helper { + template + constexpr static inline numeric_list::value...> helper(numeric_list) { + return numeric_list::value...>(); + } +}; + +template +struct tensor_static_symgroup_multiply { + private: + typedef typename A::indices iia; + typedef typename B::indices iib; + constexpr static int ffa = A::flags; + constexpr static int ffb = B::flags; + + public: + static_assert(iia::count == iib::count, "Cannot multiply symmetry elements with different number of indices."); + + typedef tensor_static_symgroup_element::helper(iia())), + ffa ^ ffb> + type; +}; + +template +struct tensor_static_symgroup_equality { + typedef typename A::indices iia; + typedef typename B::indices iib; + constexpr static int ffa = A::flags; + constexpr static int ffb = B::flags; + static_assert(iia::count == iib::count, "Cannot compare symmetry elements with different number of indices."); + + constexpr static bool value = is_same::value; + + private: + /* this should be zero if they are identical, or else the tensor + * will be forced to be pure real, pure imaginary or even pure zero + */ + constexpr static int flags_cmp_ = ffa ^ ffb; + + /* either they are not equal, then we don't care whether the flags + * match, or they are equal, and then we have to check + */ + constexpr static bool is_zero = value && flags_cmp_ == NegationFlag; + constexpr static bool is_real = value && flags_cmp_ == ConjugationFlag; + constexpr static bool is_imag = value && flags_cmp_ == (NegationFlag | ConjugationFlag); + + public: + constexpr static int global_flags = + (is_real ? GlobalRealFlag : 0) | (is_imag ? GlobalImagFlag : 0) | (is_zero ? GlobalZeroFlag : 0); +}; + +template +struct tensor_static_symgroup { + typedef StaticSGroup type; + constexpr static std::size_t size = type::static_size; +}; + +template +constexpr static inline std::array tensor_static_symgroup_index_permute(std::array idx, + internal::numeric_list, + internal::numeric_list) { + return {{idx[ii]..., idx[jj]...}}; +} + +template +static inline std::vector tensor_static_symgroup_index_permute(std::vector idx, + internal::numeric_list) { + std::vector result{{idx[ii]...}}; + std::size_t target_size = idx.size(); + for (std::size_t i = result.size(); i < target_size; i++) result.push_back(idx[i]); + return result; +} + +template +struct tensor_static_symgroup_do_apply; + +template +struct tensor_static_symgroup_do_apply> { + template + static inline RV run(const std::array& idx, RV initial, Args&&... args) { + static_assert(NumIndices >= SGNumIndices, + "Can only apply symmetry group to objects that have at least the required amount of indices."); + typedef typename internal::gen_numeric_list::type remaining_indices; + initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices(), remaining_indices()), + first::flags, initial, std::forward(args)...); + return tensor_static_symgroup_do_apply>::template run( + idx, initial, args...); + } + + template + static inline RV run(const std::vector& idx, RV initial, Args&&... args) { + eigen_assert(idx.size() >= SGNumIndices && + "Can only apply symmetry group to objects that have at least the required amount of indices."); + initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial, + std::forward(args)...); + return tensor_static_symgroup_do_apply>::template run( + idx, initial, args...); + } +}; + +template +struct tensor_static_symgroup_do_apply> { + template + static inline RV run(const std::array&, RV initial, Args&&...) { + // do nothing + return initial; + } + + template + static inline RV run(const std::vector&, RV initial, Args&&...) { + // do nothing + return initial; + } +}; + +} // end namespace internal + +template +class StaticSGroup { + constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices::value; + typedef internal::group_theory::enumerate_group_elements< + internal::tensor_static_symgroup_multiply, internal::tensor_static_symgroup_equality, + typename internal::tensor_static_symgroup_identity_ctor::type, + internal::type_list::type...>> + group_elements; + typedef typename group_elements::type ge; + + public: + constexpr inline StaticSGroup() {} + constexpr inline StaticSGroup(const StaticSGroup&) {} + constexpr inline StaticSGroup(StaticSGroup&&) {} + + template + static inline RV apply(const std::array& idx, RV initial, Args&&... args) { + return internal::tensor_static_symgroup_do_apply::template run(idx, initial, args...); + } + + template + static inline RV apply(const std::vector& idx, RV initial, Args&&... args) { + eigen_assert(idx.size() == NumIndices); + return internal::tensor_static_symgroup_do_apply::template run(idx, initial, args...); + } + + constexpr static std::size_t static_size = ge::count; + + constexpr static inline std::size_t size() { return ge::count; } + constexpr static inline int globalFlags() { return group_elements::global_flags; } + + template + inline internal::tensor_symmetry_value_setter> operator()( + Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const { + static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, + "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + } + + template + inline internal::tensor_symmetry_value_setter> operator()( + Tensor_& tensor, std::array const& indices) const { + return internal::tensor_symmetry_value_setter>(tensor, *this, indices); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h new file mode 100644 index 00000000000..2d3ff466f37 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h @@ -0,0 +1,335 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +namespace Eigen { + +enum { NegationFlag = 0x01, ConjugationFlag = 0x02 }; + +enum { GlobalRealFlag = 0x01, GlobalImagFlag = 0x02, GlobalZeroFlag = 0x03 }; + +namespace internal { + +template +struct tensor_symmetry_pre_analysis; +template +struct tensor_static_symgroup; +template +struct tensor_static_symgroup_if; +template +struct tensor_symmetry_calculate_flags; +template +struct tensor_symmetry_assign_value; +template +struct tensor_symmetry_num_indices; + +} // end namespace internal + +template +struct Symmetry { + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = 0; +}; + +template +struct AntiSymmetry { + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = NegationFlag; +}; + +template +struct Hermiticity { + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = ConjugationFlag; +}; + +template +struct AntiHermiticity { + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = ConjugationFlag | NegationFlag; +}; + +/** \class DynamicSGroup + * \ingroup TensorSymmetry_Module + * + * \brief Dynamic symmetry group + * + * The %DynamicSGroup class represents a symmetry group that need not be known at + * compile time. It is useful if one wants to support arbitrary run-time defineable + * symmetries for tensors, but it is also instantiated if a symmetry group is defined + * at compile time that would be either too large for the compiler to reasonably + * generate (using templates to calculate this at compile time is very inefficient) + * or that the compiler could generate the group but that it wouldn't make sense to + * unroll the loop for setting coefficients anymore. + */ +class DynamicSGroup; + +/** \internal + * + * \class DynamicSGroupFromTemplateArgs + * \ingroup TensorSymmetry_Module + * + * \brief Dynamic symmetry group, initialized from template arguments + * + * This class is a child class of DynamicSGroup. It uses the template arguments + * specified to initialize itself. + */ +template +class DynamicSGroupFromTemplateArgs; + +/** \class StaticSGroup + * \ingroup TensorSymmetry_Module + * + * \brief Static symmetry group + * + * This class represents a symmetry group that is known and resolved completely + * at compile time. Ideally, no run-time penalty is incurred compared to the + * manual unrolling of the symmetry. + * + * CAUTION: + * + * Do not use this class directly for large symmetry groups. The compiler + * may run into a limit, or segfault or in the very least will take a very, + * very, very long time to compile the code. Use the SGroup class instead + * if you want a static group. That class contains logic that will + * automatically select the DynamicSGroup class instead if the symmetry + * group becomes too large. (In that case, unrolling may not even be + * beneficial.) + */ +template +class StaticSGroup; + +/** \class SGroup + * \ingroup TensorSymmetry_Module + * + * \brief Symmetry group, initialized from template arguments + * + * This class represents a symmetry group whose generators are already + * known at compile time. It may or may not be resolved at compile time, + * depending on the estimated size of the group. + * + * \sa StaticSGroup + * \sa DynamicSGroup + */ +template +class SGroup : public internal::tensor_symmetry_pre_analysis::value, + Gen...>::root_type { + public: + constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices::value; + typedef typename internal::tensor_symmetry_pre_analysis::root_type Base; + + // make standard constructors + assignment operators public + inline SGroup() : Base() {} + inline SGroup(const SGroup& other) : Base(other) {} + inline SGroup(SGroup&& other) : Base(other) {} + inline SGroup& operator=(const SGroup& other) { + Base::operator=(other); + return *this; + } + inline SGroup& operator=(SGroup&& other) { + Base::operator=(other); + return *this; + } + + // all else is defined in the base class +}; + +namespace internal { + +template +struct tensor_symmetry_num_indices { + constexpr static std::size_t value = 1; +}; + +template +struct tensor_symmetry_num_indices, Sym...> { + private: + constexpr static std::size_t One = static_cast(One_); + constexpr static std::size_t Two = static_cast(Two_); + constexpr static std::size_t Three = tensor_symmetry_num_indices::value; + + // don't use std::max, since it's not constexpr until C++14... + constexpr static std::size_t maxOneTwoPlusOne = ((One > Two) ? One : Two) + 1; + + public: + constexpr static std::size_t value = (maxOneTwoPlusOne > Three) ? maxOneTwoPlusOne : Three; +}; + +template +struct tensor_symmetry_num_indices, Sym...> + : public tensor_symmetry_num_indices, Sym...> {}; +template +struct tensor_symmetry_num_indices, Sym...> + : public tensor_symmetry_num_indices, Sym...> {}; +template +struct tensor_symmetry_num_indices, Sym...> + : public tensor_symmetry_num_indices, Sym...> {}; + +/** \internal + * + * \class tensor_symmetry_pre_analysis + * \ingroup TensorSymmetry_Module + * + * \brief Pre-select whether to use a static or dynamic symmetry group + * + * When a symmetry group could in principle be determined at compile time, + * this template implements the logic whether to actually do that or whether + * to rather defer that to runtime. + * + * The logic is as follows: + *
+ *
No generators (trivial symmetry):
+ *
Use a trivial static group. Ideally, this has no performance impact + * compared to not using symmetry at all. In practice, this might not + * be the case.
+ *
More than 4 generators:
+ *
Calculate the group at run time, it is likely far too large for the + * compiler to be able to properly generate it in a realistic time.
+ *
Up to and including 4 generators:
+ *
Actually enumerate all group elements, but then check how many there + * are. If there are more than 16, it is unlikely that unrolling the + * loop (as is done in the static compile-time case) is sensible, so + * use a dynamic group instead. If there are at most 16 elements, actually + * use that static group. Note that the largest group with 4 generators + * still compiles with reasonable resources.
+ *
+ * + * Note: Example compile time performance with g++-4.6 on an Intenl Core i5-3470 + * with 16 GiB RAM (all generators non-redundant and the subgroups don't + * factorize): + * + * # Generators -O0 -ggdb -O2 + * ------------------------------------------------------------------- + * 1 0.5 s / 250 MiB 0.45s / 230 MiB + * 2 0.5 s / 260 MiB 0.5 s / 250 MiB + * 3 0.65s / 310 MiB 0.62s / 310 MiB + * 4 2.2 s / 860 MiB 1.7 s / 770 MiB + * 5 130 s / 13000 MiB 120 s / 11000 MiB + * + * It is clear that everything is still very efficient up to 4 generators, then + * the memory and CPU requirements become unreasonable. Thus we only instantiate + * the template group theory logic if the number of generators supplied is 4 or + * lower, otherwise this will be forced to be done during runtime, where the + * algorithm is reasonably fast. + */ +template +struct tensor_symmetry_pre_analysis { + typedef StaticSGroup<> root_type; +}; + +template +struct tensor_symmetry_pre_analysis { + constexpr static std::size_t max_static_generators = 4; + constexpr static std::size_t max_static_elements = 16; + typedef tensor_static_symgroup_if<(sizeof...(Gens_) + 1 <= max_static_generators), NumIndices, Gen_, Gens_...> helper; + constexpr static std::size_t possible_size = helper::size; + + typedef std::conditional_t= max_static_elements, + DynamicSGroupFromTemplateArgs, typename helper::type> + root_type; +}; + +template +struct tensor_static_symgroup_if { + constexpr static std::size_t size = 0; + typedef void type; +}; + +template +struct tensor_static_symgroup_if : tensor_static_symgroup {}; + +template +struct tensor_symmetry_assign_value { + typedef typename Tensor_::Index Index; + typedef typename Tensor_::Scalar Scalar; + constexpr static std::size_t NumIndices = Tensor_::NumIndices; + + static inline int run(const std::array& transformed_indices, int transformation_flags, int dummy, + Tensor_& tensor, const Scalar& value_) { + Scalar value(value_); + if (transformation_flags & ConjugationFlag) value = numext::conj(value); + if (transformation_flags & NegationFlag) value = -value; + tensor.coeffRef(transformed_indices) = value; + return dummy; + } +}; + +template +struct tensor_symmetry_calculate_flags { + typedef typename Tensor_::Index Index; + constexpr static std::size_t NumIndices = Tensor_::NumIndices; + + static inline int run(const std::array& transformed_indices, int transform_flags, + int current_flags, const std::array& orig_indices) { + if (transformed_indices == orig_indices) { + if (transform_flags & (ConjugationFlag | NegationFlag)) + return current_flags | GlobalImagFlag; // anti-hermitian diagonal + else if (transform_flags & ConjugationFlag) + return current_flags | GlobalRealFlag; // hermitian diagonal + else if (transform_flags & NegationFlag) + return current_flags | GlobalZeroFlag; // anti-symmetric diagonal + } + return current_flags; + } +}; + +template +class tensor_symmetry_value_setter { + public: + typedef typename Tensor_::Index Index; + typedef typename Tensor_::Scalar Scalar; + constexpr static std::size_t NumIndices = Tensor_::NumIndices; + + inline tensor_symmetry_value_setter(Tensor_& tensor, Symmetry_ const& symmetry, + std::array const& indices) + : m_tensor(tensor), m_symmetry(symmetry), m_indices(indices) {} + + inline tensor_symmetry_value_setter& operator=(Scalar const& value) { + doAssign(value); + return *this; + } + + private: + Tensor_& m_tensor; + Symmetry_ m_symmetry; + std::array m_indices; + + inline void doAssign(Scalar const& value) { +#ifdef EIGEN_TENSOR_SYMMETRY_CHECK_VALUES + int value_flags = m_symmetry.template apply, int>( + m_indices, m_symmetry.globalFlags(), m_indices); + if (value_flags & GlobalRealFlag) eigen_assert(numext::imag(value) == 0); + if (value_flags & GlobalImagFlag) eigen_assert(numext::real(value) == 0); +#endif + m_symmetry.template apply, int>(m_indices, 0, m_tensor, value); + } +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h new file mode 100644 index 00000000000..aa16f3cd448 --- /dev/null +++ b/wpimath/src/main/native/thirdparty/eigen/include/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h @@ -0,0 +1,492 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H +#define EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H + +// IWYU pragma: private +#include "../InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +namespace group_theory { + +/** \internal + * \file CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h + * This file contains C++ templates that implement group theory algorithms. + * + * The algorithms allow for a compile-time analysis of finite groups. + * + * Currently only Dimino's algorithm is implemented, which returns a list + * of all elements in a group given a set of (possibly redundant) generators. + * (One could also do that with the so-called orbital algorithm, but that + * is much more expensive and usually has no advantages.) + */ + +/********************************************************************** + * "Ok kid, here is where it gets complicated." + * - Amelia Pond in the "Doctor Who" episode + * "The Big Bang" + * + * Dimino's algorithm + * ================== + * + * The following is Dimino's algorithm in sequential form: + * + * Input: identity element, list of generators, equality check, + * multiplication operation + * Output: list of group elements + * + * 1. add identity element + * 2. remove identities from list of generators + * 3. add all powers of first generator that aren't the + * identity element + * 4. go through all remaining generators: + * a. if generator is already in the list of elements + * -> do nothing + * b. otherwise + * i. remember current # of elements + * (i.e. the size of the current subgroup) + * ii. add all current elements (which includes + * the identity) each multiplied from right + * with the current generator to the group + * iii. add all remaining cosets that are generated + * by products of the new generator with itself + * and all other generators seen so far + * + * In functional form, this is implemented as a long set of recursive + * templates that have a complicated relationship. + * + * The main interface for Dimino's algorithm is the template + * enumerate_group_elements. All lists are implemented as variadic + * type_list and numeric_list + * templates. + * + * 'Calling' templates is usually done via typedefs. + * + * This algorithm is an extended version of the basic version. The + * extension consists in the fact that each group element has a set + * of flags associated with it. Multiplication of two group elements + * with each other results in a group element whose flags are the + * XOR of the flags of the previous elements. Each time the algorithm + * notices that a group element it just calculated is already in the + * list of current elements, the flags of both will be compared and + * added to the so-called 'global flags' of the group. + * + * The rationale behind this extension is that this allows not only + * for the description of symmetries between tensor indices, but + * also allows for the description of hermiticity, antisymmetry and + * antihermiticity. Negation and conjugation each are specific bit + * in the flags value and if two different ways to reach a group + * element lead to two different flags, this poses a constraint on + * the allowed values of the resulting tensor. For example, if a + * group element is reach both with and without the conjugation + * flags, it is clear that the resulting tensor has to be real. + * + * Note that this flag mechanism is quite generic and may have other + * uses beyond tensor properties. + * + * IMPORTANT: + * This algorithm assumes the group to be finite. If you try to + * run it with a group that's infinite, the algorithm will only + * terminate once you hit a compiler limit (max template depth). + * Also note that trying to use this implementation to create a + * very large group will probably either make you hit the same + * limit, cause the compiler to segfault or at the very least + * take a *really* long time (hours, days, weeks - sic!) to + * compile. It is not recommended to plug in more than 4 + * generators, unless they are independent of each other. + */ + +/** \internal + * + * \class strip_identities + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Cleanse a list of group elements of the identity element + * + * This template is used to make a first pass through all initial + * generators of Dimino's algorithm and remove the identity + * elements. + * + * \sa enumerate_group_elements + */ +template