diff --git a/.github/workflows/testing-gcc.yml b/.github/workflows/testing-gcc.yml index e60a886afa4..706a0989e85 100644 --- a/.github/workflows/testing-gcc.yml +++ b/.github/workflows/testing-gcc.yml @@ -34,6 +34,7 @@ jobs: - name: Test run: | + export LSAN_OPTIONS=suppressions=$PWD/tools/docker/lsan.supp cd build ctest --output-on-failure diff --git a/.github/workflows/testing-macos.yml b/.github/workflows/testing-macos.yml index 5d32ea8b48a..12cf73a9421 100644 --- a/.github/workflows/testing-macos.yml +++ b/.github/workflows/testing-macos.yml @@ -6,6 +6,10 @@ on: - 'develop' pull_request: +# Workaround issue in Xcode 14.1/2 +env: + DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer + jobs: build-and-test: runs-on: macos-latest @@ -16,7 +20,7 @@ jobs: use_openmp: [OPENMP=ON] use_smm: [SMM=blas] blas_impl: [accelerate,openblas] - mpi_suffix: [openmpi,mpich] + mpi_suffix: [openmpi] exclude: - use_mpi: MPI=OFF mpi_suffix: mpich @@ -27,19 +31,14 @@ jobs: fetch-depth: 0 submodules: true - - name: Install dependencies + - name: Install common dependencies run: | env HOMEBREW_NO_AUTO_UPDATE=1 brew install \ - ninja \ - openmpi - - - name: Unlink OpenMPI - run: | - brew unlink openmpi + ninja - - name: Install MPICH + - name: Install ${{ matrix.mpi_suffix }} run: | - env HOMEBREW_NO_AUTO_UPDATE=1 brew install mpich + env HOMEBREW_NO_AUTO_UPDATE=1 brew install ${{ matrix.mpi_suffix }} - name: Configure run: | @@ -53,7 +52,6 @@ jobs: -DUSE_${{ matrix.use_openmp }} \ -DUSE_${{ matrix.use_smm }} \ $([ "${{ matrix.blas_impl }}" = "openblas" ] && echo '-DCMAKE_PREFIX_PATH=/usr/local/opt/openblas') \ - -DMPIEXEC_EXECUTABLE="$([ "${{ matrix.mpi_suffix }}" = "openmpi" ] && command -v /usr/local/Cellar/open-mpi/*/bin/mpiexec || command -v /usr/local/Cellar/mpich/*/bin/mpiexec)" \ -DMPIEXEC_PREFLAGS="$([ "${{ matrix.mpi_suffix }}" = "openmpi" ] && echo "-mca btl ^openib --allow-run-as-root")" \ -DTEST_MPI_RANKS=1 \ .. 
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9b8f82a6d71..1def9bcfb38 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,10 +25,18 @@ repos: - id: check-yaml - id: check-symlinks - id: trailing-whitespace + exclude: >- + (?x)^( + tools/vecLibFort/.*| + )$ - repo: https://github.com/pseewald/fprettify rev: v0.3.7 hooks: - id: fprettify + exclude: >- + (?x)^( + tools/vecLibFort/.*| + )$ - repo: https://github.com/cheshirekow/cmake-format-precommit rev: v0.6.13 hooks: @@ -64,3 +72,8 @@ repos: files: \.(c|cc|cxx|cpp|cl|frag|glsl|h|hpp|hxx|ih|ispc|ipp|java|js|m|mm|proto|textproto|vert)$ args: ['-i', '-fallback-style=none', '--style=file'] additional_dependencies: ['clang-format'] + exclude: >- + (?x)^( + tools/vecLibFort/.*| + )$ + diff --git a/CMakeLists.txt b/CMakeLists.txt index 8a174910f21..d1dd70b41c4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,6 +163,23 @@ endif () find_package(LAPACK REQUIRED) # needed for some of the integrated test routines, # also calls find_package(BLAS) +if (APPLE + AND (BLAS_LIBRARIES MATCHES "Accelerate" + OR BLAS_LIBRARIES MATCHES "vecLib" # automated search + OR BLA_VENDOR STREQUAL "Accelerate" + OR BLA_VENDOR STREQUAL "NAS" # user override + )) + message(CHECK_START "Looking for vecLibFort library") + find_library(VECLIBFORT_LIBRARY vecLibFort) + if (NOT VECLIBFORT_LIBRARY) + message(CHECK_FAIL "not found, building it") + add_subdirectory(tools/vecLibFort) + set(VECLIBFORT_LIBRARY vecLibFort) + else () + message(CHECK_PASS "found at " ${VECLIBFORT_LIBRARY}) + endif () +endif () + # =================================== Python this module looks preferably for # version 3 of Python. If not found, version 2 is searched. 
In CMake 3.15, if a # python virtual environment is activated, it will search the virtual diff --git a/docs/guide/2-user-guide/1-installation/index.md b/docs/guide/2-user-guide/1-installation/index.md index fc06dbb2b86..315edf4438a 100644 --- a/docs/guide/2-user-guide/1-installation/index.md +++ b/docs/guide/2-user-guide/1-installation/index.md @@ -9,8 +9,12 @@ You need: * [CMake](https://cmake.org/) (3.22+) * GNU make or Ninja * Fortran compiler which supports at least Fortran 2008 (including the TS 29113 when using the C-bindings) -* BLAS+LAPACK implementation (reference, OpenBLAS and MKL have been tested. Note: DBCSR linked to OpenBLAS 0.3.6 gives wrong results on Power9 architectures.) -* Python version installed (2.7 or 3.6+ have been tested) +* BLAS+LAPACK implementation + * Reference BLAS/LAPACK, OpenBLAS and MKL have been tested and can be considered supported. + * On macOS [vecLibFort](https://github.com/mcg1969/vecLibFort) is required to use Accelerate and/or vecLib. + The build system will automatically build a bundled version if not found on the system. + * DBCSR linked to OpenBLAS 0.3.6 gives wrong results on Power9 architectures. +* Python version installed (3.6+ have been tested) Optional: diff --git a/docs/guide/3-developer-guide/3-programming/1-overview/index.md b/docs/guide/3-developer-guide/3-programming/1-overview/index.md index 087667bbce9..88d37a270f9 100644 --- a/docs/guide/3-developer-guide/3-programming/1-overview/index.md +++ b/docs/guide/3-developer-guide/3-programming/1-overview/index.md @@ -46,7 +46,6 @@ Assumed square matrix with 20x20 matrix with 5x5 blocks and a 2x2 processor grid | `__NO_STATM_ACCESS`, `__STATM_RESIDENT` or `__STATM_TOTAL` | Toggle memory usage reporting between resident memory and total memory. 
In particular, macOS users must use `-D__NO_STATM_ACCESS` | Fortran | | `__NO_ABORT` | Avoid calling abort, but STOP instead (useful for coverage testing, and to avoid core dumps on some systems) | Fortran | | `__LIBXSMM` | Enable [LIBXSMM](https://github.com/hfp/libxsmm/) link for optimized small matrix multiplications on CPU | Fortran | -| `__ACCELERATE` | Must be defined on macOS when Apple's Accelerate framework is used for BLAS and LAPACK (this is due to some interface incompatibilities between Accelerate and reference BLAS/LAPACK) | Fortran | | `NDEBUG` | Assertions are stripped ("compiled out"), `NDEBUG` is the ANSI-conforming symbol name (not `__NDEBUG`). Regular release builds may carry assertions for safety | Fortran, C, C++ | | `__CRAY_PM_ACCEL_ENERGY` or `__CRAY_PM_ENERGY` | Switch on collectin energy profiling on Cray systems | Fortran | | `__DBCSR_ACC` | Enable Accelerator compilation | Fortran, C, C++ | diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 42934e9b0fa..951c510b86c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -176,8 +176,8 @@ if (APPLE) # fix /proc/self/statm can not be opened on macOS target_compile_definitions(dbcsr PRIVATE __NO_STATM_ACCESS) - if (BLAS_LIBRARIES MATCHES "Accelerate") - target_compile_definitions(dbcsr PRIVATE __ACCELERATE) + if (VECLIBFORT_LIBRARY) + target_link_libraries(dbcsr PRIVATE ${VECLIBFORT_LIBRARY}) endif () endif () @@ -243,6 +243,7 @@ if (USE_ACCEL) target_link_libraries( dbcsr PRIVATE $<$:CUDA::cudart> + $<$:CUDA::cuda_driver> $<$:CUDA::cublas> $<$:CUDA::nvrtc> $<$:CUDA::nvToolsExt> diff --git a/src/acc/hip/acc_hip.h b/src/acc/hip/acc_hip.h index dc4f255fd9e..33800c01eef 100644 --- a/src/acc/hip/acc_hip.h +++ b/src/acc/hip/acc_hip.h @@ -12,7 +12,11 @@ #include #include -#include +#if __has_include() +# include +#else +# include +#endif #include #define ACC(x) hip##x diff --git a/src/mm/dbcsr_mm_common.F b/src/mm/dbcsr_mm_common.F index 937043f23e9..e12a4fbb455 100644 --- 
a/src/mm/dbcsr_mm_common.F +++ b/src/mm/dbcsr_mm_common.F @@ -579,11 +579,7 @@ SUBROUTINE calc_norms_${nametype1}$ (norms, nblks, & INTEGER :: blk, bp, bpe, row, col REAL(KIND=real_8), EXTERNAL :: DDOT -#if defined (__ACCELERATE) - REAL(KIND=real_8), EXTERNAL :: SDOT -#else REAL(KIND=real_4), EXTERNAL :: SDOT -#endif ! --------------------------------------------------------------------------- diff --git a/src/mm/dbcsr_mm_multrec.F b/src/mm/dbcsr_mm_multrec.F index d8d0420f782..35c77b938de 100644 --- a/src/mm/dbcsr_mm_multrec.F +++ b/src/mm/dbcsr_mm_multrec.F @@ -707,11 +707,7 @@ SUBROUTINE multrec_filtering_${nametype1}$ (filter_eps, nblks, rowi, coli, blkp, REAL(kind=real_8) :: nrm REAL(KIND=real_8), EXTERNAL :: DZNRM2, DDOT -#if defined (__ACCELERATE) - REAL(KIND=real_8), EXTERNAL :: SCNRM2, SDOT -#else REAL(KIND=real_4), EXTERNAL :: SCNRM2, SDOT -#endif REAL(kind=real_8) :: filter_eps_opt diff --git a/src/ops/dbcsr_operations.F b/src/ops/dbcsr_operations.F index 9f5bd5a1747..d16e5c1130c 100644 --- a/src/ops/dbcsr_operations.F +++ b/src/ops/dbcsr_operations.F @@ -1910,11 +1910,7 @@ SUBROUTINE dbcsr_filter_anytype(matrix, eps, method, & TYPE(dbcsr_iterator) :: iter REAL(KIND=real_8), EXTERNAL :: DZNRM2 -#if defined (__ACCELERATE) - REAL(KIND=real_8), EXTERNAL :: SCNRM2 -#else REAL(KIND=real_4), EXTERNAL :: SCNRM2 -#endif ! 
--------------------------------------------------------------------------- diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6be544b1fd1..2b050add41d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -95,23 +95,20 @@ set(dbcsr_unittest_common_SRCS dbcsr_test_add.F dbcsr_test_multiply.F) # instead of building a full-blown lib, it would be better to simply build an # OBJECT lib, but we would need cmake 3.12 to be able to specify # target_link_libraries on those to get the proper compile flags -add_library(dbcsr_unittest_common STATIC ${dbcsr_unittest_common_SRCS}) +add_library(dbcsr_unittest_common OBJECT ${dbcsr_unittest_common_SRCS}) +target_link_libraries(dbcsr_unittest_common PUBLIC dbcsr) target_link_libraries(dbcsr_unittest_common PUBLIC ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) if (OpenMP_FOUND) target_link_libraries(dbcsr_unittest_common PUBLIC OpenMP::OpenMP_Fortran) endif () -if (APPLE AND BLAS_LIBRARIES MATCHES "Accelerate") - target_compile_definitions(dbcsr_unittest_common PRIVATE __ACCELERATE) -endif () -target_link_libraries(dbcsr_unittest_common PUBLIC dbcsr) - # Compile Fortran tests foreach (dbcsr_test ${DBCSR_TESTS_FTN}) add_executable(${dbcsr_test} ${${dbcsr_test}_SRCS}) - target_link_libraries(${dbcsr_test} dbcsr_unittest_common) + target_link_libraries(${dbcsr_test} PUBLIC dbcsr_unittest_common) set_target_properties(${dbcsr_test} PROPERTIES LINKER_LANGUAGE Fortran) + # register unittest executable with CMake if (USE_MPI) separate_arguments(MPIEXEC_PREFLAGS) @@ -124,7 +121,6 @@ foreach (dbcsr_test ${DBCSR_TESTS_FTN}) add_test(NAME ${dbcsr_test} COMMAND ${dbcsr_test}) endif () if (OpenMP_FOUND) - target_link_libraries(${dbcsr_test} OpenMP::OpenMP_Fortran) set_tests_properties( ${dbcsr_test} PROPERTIES ENVIRONMENT OMP_NUM_THREADS=${TEST_OMP_THREADS}) endif () diff --git a/tests/dbcsr_test_add.F b/tests/dbcsr_test_add.F index 692f9c914d7..30fe02679d3 100644 --- a/tests/dbcsr_test_add.F +++ b/tests/dbcsr_test_add.F @@ 
-377,11 +377,7 @@ SUBROUTINE dbcsr_check_add(test_name, matrix_a, dense_a_dbcsr, dense_a, dense_b, LOGICAL :: valid REAL(real_4), ALLOCATABLE, DIMENSION(:) :: work_sp -#if defined (__ACCELERATE) - REAL(real_8), EXTERNAL :: clange, slamch, slange -#else REAL(real_4), EXTERNAL :: clange, slamch, slange -#endif REAL(real_8) :: a_norm_dbcsr, a_norm_in, a_norm_out, & b_norm, eps, residual REAL(real_8), ALLOCATABLE, DIMENSION(:) :: work diff --git a/tests/dbcsr_test_multiply.F b/tests/dbcsr_test_multiply.F index 96081a15272..d36474e3b9f 100644 --- a/tests/dbcsr_test_multiply.F +++ b/tests/dbcsr_test_multiply.F @@ -553,11 +553,7 @@ SUBROUTINE dbcsr_check_multiply(test_name, matrix_c, dense_c_dbcsr, dense_a, den LOGICAL :: valid REAL(real_4), ALLOCATABLE, DIMENSION(:) :: work_sp -#if defined (__ACCELERATE) - REAL(real_8), EXTERNAL :: clange, slamch, slange -#else REAL(real_4), EXTERNAL :: clange, slamch, slange -#endif REAL(real_8) :: a_norm, b_norm, c_norm_dbcsr, c_norm_in, & c_norm_out, eps, eps_norm, residual REAL(real_8), ALLOCATABLE, DIMENSION(:) :: work diff --git a/tools/docker/lsan.supp b/tools/docker/lsan.supp index 028f0a11168..4cd8c021695 100644 --- a/tools/docker/lsan.supp +++ b/tools/docker/lsan.supp @@ -1,3 +1,5 @@ # leak due to compiler bug triggered by combination of OOP and ALLOCATABLE leak:__dbcsr_tensor_types_MOD___copy_dbcsr_tensor_types_Dbcsr_tas_dist_t leak:__dbcsr_tensor_types_MOD___copy_dbcsr_tensor_types_Dbcsr_tas_blk_size_t +# similar case, for gcc-13+ +leak:__dbcsr_tas_global_MOD___copy_dbcsr_tas_global_Dbcsr_tas_blk_size_arb diff --git a/tools/vecLibFort/CMakeLists.txt b/tools/vecLibFort/CMakeLists.txt new file mode 100644 index 00000000000..a36b6e15508 --- /dev/null +++ b/tools/vecLibFort/CMakeLists.txt @@ -0,0 +1,11 @@ +add_library(vecLibFort STATIC vecLibFort.c) + +if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + target_compile_options(vecLibFort PRIVATE -flax-vector-conversions) +endif () + +install( + TARGETS vecLibFort + EXPORT DBCSRTargets + 
LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}") diff --git a/tools/vecLibFort/LICENSE b/tools/vecLibFort/LICENSE new file mode 100644 index 00000000000..36b7cd93cdf --- /dev/null +++ b/tools/vecLibFort/LICENSE @@ -0,0 +1,23 @@ +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/tools/vecLibFort/Makefile b/tools/vecLibFort/Makefile new file mode 100644 index 00000000000..507d894e37e --- /dev/null +++ b/tools/vecLibFort/Makefile @@ -0,0 +1,49 @@ +PREFIX=/usr/local +LIBDIR=$(PREFIX)/lib + +CFLAGS=-O + +NAME=vecLibFort +SOURCE=$(NAME).c +OBJECT=$(NAME).o +LIBRARY=lib$(NAME) +STATIC=$(LIBRARY).a +DYNAMIC=$(LIBRARY).dylib +PRELOAD=$(LIBRARY)I.dylib +INCLUDES=cloak.h static.h +DEPEND=$(INCLUDES) Makefile + +all: static dynamic preload +static: $(STATIC) +dynamic: $(DYNAMIC) +preload: $(PRELOAD) + +$(OBJECT): $(DEPEND) + +$(STATIC): $(OBJECT) + ar -cru $@ $^ + ranlib $@ + +$(DYNAMIC): $(OBJECT) + clang -shared -o $@ $^ \ + -Wl,-reexport_framework -Wl,Accelerate \ + -install_name $(LIBDIR)/$@ + +$(PRELOAD): $(SOURCE) $(DEPEND) + clang -shared $(CFLAGS) -DVECLIBFORT_INTERPOSE -o $@ -O $(SOURCE) \ + -Wl,-reexport_framework -Wl,Accelerate \ + -install_name $(LIBDIR)/$@ + +install: all + mkdir -p $(LIBDIR) + cp -f $(STATIC) $(LIBDIR) + cp -f $(DYNAMIC) $(LIBDIR) + cp -f $(PRELOAD) $(LIBDIR) + +clean: + rm -f $(OBJECT) $(STATIC) $(DYNAMIC) $(PRELOAD) + +check: tester.f90 $(OBJECT) + gfortran -o tester -O $^ -framework Accelerate + ./tester + diff --git a/tools/vecLibFort/README.md b/tools/vecLibFort/README.md new file mode 100644 index 00000000000..a0149458c2a --- /dev/null +++ b/tools/vecLibFort/README.md @@ -0,0 +1,212 @@ +## A GNU Fortran interface to Apple's Accelerate/vecLib BLAS/LAPACK + +### Introduction + +vecLibFort is lightweight but flexible "shim" designed to rectify +the incompatibilities between the Accelerate/vecLib BLAS and LAPACK libraries +shipped with Mac OS X and FORTRAN code compiled with modern compilers +such as [GNU Fortran][]. + +You *will* want this code if you are... + + * compiling your code directly from FORTRAN source; *and* + * using Apple's BLAS and/or LAPACK for your linear algebra; *and* + * using single-precision or complex arithmetic. + +You *will not* need this code if you are... 
+ + * using some other linear algebra package; *or* + * calling BLAS and LAPACK only from C; *or* + * using an alternative BLAS/LAPACK package ([OpenBlas][],[MKL][]); *or* + * using only double-precision real arithmetic. + +You *may* want this code if you are... + + * running a *pre-compiled* program, or linking to a *pre-compiled* + library, that seems to exhibit bugs described in the [next](#background) + section. See the section [Preloaded (interposing) library](#preloaded) + for more details on how you may be able to fix these programs without + recompilation. + + +### Background + +[Apple's vecLib framework][vecLib] provides both C and FORTRAN bindings for +BLAS and LAPACK, the de-facto standard libraries for dense numerical linear +algebra. Because there remains quite a bit of useful FORTRAN code out there +that in turn depend on BLAS and LAPACK, this is certainly a welcome provision +from Apple. + +Unfortunately, those FORTRAN bindings follow an [F2C][]-style return value +convention, while [GNU Fortran][] uses a [different convention][gnufarg]. Most +subroutines and functions work without modification; in particular, if you +rely solely on double-precision *real* arithmetic, you are fine. For single +precision or complex arithmetic, there are two fatal incompatibilities: + +* Functions whose FORTRAN specifications call for returning single-precision + real values, such as ``sdot_`` and ``snrm2_``, actually return + *double-precision* results in the Apple/F2C calling convention. GNU Fortran, + on the other hand, expects to receive the single-precision result. +* Functions designed to return complex values, whether single-precision or + double-precision, are converted to subroutines in the Apple/F2C convention, + with a pointer to the return value serving as the first argument. (Note that + this differs from the CBLAS convention of passing a pointer to the + return value as the *final* argument.) 
GNU Fortran, on the other hand, + expects these values to be returned as a C-style return value. + +For programs that use single-precision or complex arithmetic, then, these +incompatibilities *must* be addressed or incorrect results and crashes can +occur. In some projects, these errors go uncorrected, because the use cases +that exercise them are uncommon. + +One solution is to force GNU Fortran to adopt the older, F2C-style return +value convention, using the ``-ff2c`` flag. If that solution is sufficient +for you, then I encourage you to adopt it. Unfortunately, this may not be +possible if there is other code or other libraries that you rely upon that +assume the default GNU Fortran convention. And don't forget to rewrite your +C code according to the F2C return value conventions. + +The approach taken by vecLibFort is to provide a thin translation layer +between the F2C and GFortran worlds, for the few functions where there is a +difference. For BLAS, this is simply a matter of wrapping Apple's CBLAS +calls in a FORTRAN-friendly wrapper. For LAPACK, a bit of dlopen/dlsym +trickery is required to avoid name conflicts. + +Still another option is to use a different BLAS and LAPACK library, such +as [MKL][] or [OpenBlas][]. I am sure there are good arguments to be made +for all three options. + +### Using vecLibFort + +This code can be used in one of three ways, and the included ``Makefile`` +builds all three for you. The only variable you may want to modify is the +``PREFIX`` variable, which determines the install location ``$(PREFIX)/lib``. + +#### Dynamic library: ``libvecLibFort.dylib`` + +The most straightforward way to use ``vecLibFort`` is by linking with the +standard dynamic library using ``-lvecLibFort``. Of course, if you installed +the library in a non-standard location, you will need an ``-L`` linker +flag as well. + +If you use this approach, you do *not* need to add ``-framework vecLib`` or +``-framework Accelerate`` as well.
That is because vecLibFort is built to +re-export all of Accelerate's symbols, even those it does not "fix". Thus it +serves as a *full replacement* for vecLib/Accelerate. + +#### Static library / direct inclusion: ``libvecLibFort.a`` + +For new projects, feel free to add ``vecLibFort.c``, ``static.h``, and +``cloak.h`` to your project, or link with the static library. You will also +need to link ``-framework vecLib`` or ``-framework Accelerate``. + + +#### Preloaded (interposing) library: ``libvecLibFortI.dylib`` + +Suppose you have a program that is already compiled, but which apparently +exhibits the errors discussed herein. Or perhaps you are using a precompiled +third-party library that has not implemented measures like these itself; but +because it has already been linked to vecLib, the bugs are baked in. (If you +can alter the linking information of a dynamic library, I bow to your skill.) + +In these cases, there is a *preload* feature of Mac OSX's ``dyld`` system that +can come in quite handy. The OS makes it possible to specify a library to be +*preloaded* before the application, with a list of instructions to replace +functions with alternate versions, a process known as *interposing*. The +source file ``vecLibFort.c`` includes this interposing code, but it is +wrapped with ``#ifdef VECLIBFORT_INTERPOSE`` to avoid clashing with the +non-interposing code. + +To use this library, you must add the full path to ``libvecLibFortI.dylib`` +to the [``DYLD_INSERT_LIBRARIES`` environment variable][DYLD]. For instance, +if it has been installed in the default location, the command + + DYLD_INSERT_LIBRARIES=/usr/local/lib/libvecLibFortI.dylib program + +will run the program ``program`` but with the BLAS and LAPACK calls corrected. + +Of course, this may not work---it may be that the bugs you are seeing are not +in fact caused by the specific issues addressed by vecLibFort. Or I might not +have implemented something correctly. (Bug reports are welcome.) 
And you +should *not* use this if the program or library *already* uses the F2C +calling conventions correctly; you *will* break it. + +### Inspirations + +The code in ``vecLibFort.c`` is new, but the concepts that undergird it are +most certainly not. The inspirations include: + +* The [dotwrp project][dotwrp] provides a simple FORTRAN-based wrapper + for the 5 most common problematic BLAS functions. Thanks to vecLib's CBLAS + interface, the substitutions can be made statically. We have extended this + approach to cover all of the relevant BLAS calls, and implemented it in C. +* The dynamic substitution approach is heavily inspired by the method used by + [GNU Octave](https://www.gnu.org/software/octave/), as contributed by Jarno + Rajahaime. You can see the code [here][blaswrap]. vecLibFort differs from Octave + in that it resolves the replacements lazily, eliminating the need for + lookup tables and (hopefully) improving performance. It also implements the + full set of BLAS/LAPACK replacements, whereas Octave replaces only a subset. +* The interposing implementation is explained in a variety of places on the + Internet, including section 2.6.3.4 of Amit Singh's book "Mac OSX + Internals." (http://osxbook.com). Point your favorite search engine to the + term [``DYLD_INSERT_LIBRARIES``][Google] to find a wealth of material. +* In order to make the primary source file as compact as possible, this code + employs a simple preprocessor library by Paul Fultz II called [Cloak][]. The + [Boost Preprocessor Library][Boost] is perhaps a more well known example + of this kind of work, but it is far more complex than needed in this case. + +### License + +##### English + +I've released this under the [Boost Software License](http://www.boost.org/users/license.html). So do whatever +you wish with it. You do not have to redistribute the source code; but if you +do, you must include the license with it.
+ +If you do use this in your projects, I would appreciate it if you would give +me credit, as I have attempted to do in the previous section. But I'm not +going to get bent out of shape about it. Large piles of cash are welcome, as +are simple emails of gratitude, or pull requests! + +##### Legalese + +> Boost Software License - Version 1.0 - August 17th, 2003 +> +> Permission is hereby granted, free of charge, to any person or organization +> obtaining a copy of the software and accompanying documentation covered by +> this license (the "Software") to use, reproduce, display, distribute, +> execute, and transmit the Software, and to prepare derivative works of the +> Software, and to permit third-parties to whom the Software is furnished to +> do so, all subject to the following: +> +> The copyright notices in the Software and this entire statement, including +> the above license grant, this restriction and the following disclaimer, +> must be included in all copies of the Software, in whole or in part, and +> all derivative works of the Software, unless such copies or derivative +> works are solely in the form of machine-executable object code generated by +> a source language processor. +> +> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +> FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +> SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +> FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +> ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +> DEALINGS IN THE SOFTWARE. 
+ +[vecLib]:https://developer.apple.com/library/mac/documentation/Performance/Conceptual/vecLib/Reference/reference.html +[GNU Fortran]:http://gcc.gnu.org/fortran/ +[gnufarg]:http://gcc.gnu.org/onlinedocs/gfortran/Argument-passing-conventions.html +[F2C]:http://www.netlib.org/f2c/ +[DYLD]:https://developer.apple.com/library/mac/documentation/Darwin/Reference/ManPages/man1/dyld.1.html +[dotwrp]:https://github.com/tenomoto/dotwrp +[GNU Octave]:https://www.gnu.org/software/octave/ +[blaswrap]:http://hg.savannah.gnu.org/hgweb/octave/file/tip/liboctave/cruft/misc/blaswrap.c +[Google]:https://www.google.com/search?q=DYLD_INSERT_LIBRARIES +[Cloak]:https://github.com/pfultz2/Cloak/blob/master/cloak.h +[Boost]:http://www.boost.org/doc/libs/1_55_0/libs/preprocessor/doc/index.html +[OpenBLAS]:http://www.openblas.net/ +[MKL]:http://software.intel.com/en-us/intel-mkl +[blasbug]:http://www.macresearch.org/lapackblas-fortran-106 +[boost]:http://www.boost.org/users/license.html + diff --git a/tools/vecLibFort/cloak.h b/tools/vecLibFort/cloak.h new file mode 100644 index 00000000000..675b21de802 --- /dev/null +++ b/tools/vecLibFort/cloak.h @@ -0,0 +1,113 @@ +/* + + Cloak + https://github.com/pfultz2/Cloak + A mini preprocessor library + Copyright (c) 2012-2014 Paul Fultz II + + Use, modification and distribution is subject to the Boost Software + License, Version 1.0. See the accompanying file LICENSE or + + http://www.boost.org/LICENSE_1_0.txt + + Permission granted by the author to include this file in vecLibFort + under the terms of this license. + +*/ + +#define CAT(a, ...) PRIMITIVE_CAT(a, __VA_ARGS__) +#define PRIMITIVE_CAT(a, ...) 
a ## __VA_ARGS__ + +#define INC(x) PRIMITIVE_CAT(INC_, x) +#define INC_0 1 +#define INC_1 2 +#define INC_2 3 +#define INC_3 4 +#define INC_4 5 +#define INC_5 6 +#define INC_6 7 +#define INC_7 8 +#define INC_8 9 +#define INC_9 10 +#define INC_10 11 +#define INC_11 12 + +#define DEC(x) PRIMITIVE_CAT(DEC_, x) +#define DEC_0 0 +#define DEC_1 0 +#define DEC_2 1 +#define DEC_3 2 +#define DEC_4 3 +#define DEC_5 4 +#define DEC_6 5 +#define DEC_7 6 +#define DEC_8 7 +#define DEC_9 8 +#define DEC_10 9 +#define DEC_11 10 + +#define EXPR_S(s) PRIMITIVE_CAT(EXPR_, s) +#define EXPR_0(...) __VA_ARGS__ +#define EXPR_1(...) __VA_ARGS__ +#define EXPR_2(...) __VA_ARGS__ +#define EXPR_3(...) __VA_ARGS__ +#define EXPR_4(...) __VA_ARGS__ +#define EXPR_5(...) __VA_ARGS__ +#define EXPR_6(...) __VA_ARGS__ +#define EXPR_7(...) __VA_ARGS__ +#define EXPR_8(...) __VA_ARGS__ +#define EXPR_9(...) __VA_ARGS__ +#define EXPR_10(...) __VA_ARGS__ +#define EXPR_11(...) __VA_ARGS__ +#define EXPR_12(...) __VA_ARGS__ + +#define CHECK_N(x, n, ...) n +#define CHECK(...) CHECK_N(__VA_ARGS__, 0,) + +#define NOT(x) CHECK(PRIMITIVE_CAT(NOT_, x)) +#define NOT_0 ~, 1, + +#define COMPL(b) PRIMITIVE_CAT(COMPL_, b) +#define COMPL_0 1 +#define COMPL_1 0 + +#define BOOL(x) COMPL(NOT(x)) + +#define IIF(c) PRIMITIVE_CAT(IIF_, c) +#define IIF_0(t, ...) __VA_ARGS__ +#define IIF_1(t, ...) t + +#define IF(c) IIF(BOOL(c)) + +#define EAT(...) +#define EXPAND(...) __VA_ARGS__ +#define WHEN(c) IF(c)(EXPAND, EAT) + +#define EMPTY() +#define DEFER(id) id EMPTY() +#define OBSTRUCT(id) id DEFER(EMPTY)() + +//#define REPEAT_S(s, n, m, ...) \ +// IF(n)(REPEAT_I, EAT)(OBSTRUCT(), INC(s), DEC(n), m, __VA_ARGS__) +// +//#define REPEAT_INDIRECT() REPEAT_S +//#define REPEAT_I(_, s, n, m, ...) \ +// EXPR_S _(s)( \ +// REPEAT_INDIRECT _()(s, n, m, __VA_ARGS__) \ +// )\ +// m _(s, n, __VA_ARGS__) + +#define REPEAT_S(s, n, m, ...) 
\ + REPEAT_I(OBSTRUCT(), INC(s), n, m, __VA_ARGS__) + +#define REPEAT_INDIRECT() REPEAT_I +#define REPEAT_I(_, s, n, m, ...) \ + WHEN _(n)(EXPR_S _(s)( \ + REPEAT_INDIRECT _()(OBSTRUCT _(), INC _(s), DEC _(n), m, __VA_ARGS__) \ + ))\ + m _(s, n, __VA_ARGS__) + +#define COMMA() , + +#define COMMA_IF(n) IF(n)(COMMA, EAT)() + diff --git a/tools/vecLibFort/static.h b/tools/vecLibFort/static.h new file mode 100644 index 00000000000..5cbbe5fae96 --- /dev/null +++ b/tools/vecLibFort/static.h @@ -0,0 +1,119 @@ +/* + + vecLibFort + https://github.com/mcg1969/vecLibFort + Run-time F2C/GFORTRAN translation for Apple's vecLib BLAS/LAPACK + Copyright (c) 2014 Michael C. Grant + + See README.md for full background and usage details. + + Use, modification and distribution is subject to the Boost Software + License, Version 1.0. See the accompanying file LICENSE or + + http://www.boost.org/LICENSE_1_0.txt + +*/ + +#if defined(ADD_UNDERSCORE) +#define FNAME(x) x ## _ +#define STATIC +#elif defined(ADD_PREFIX) +#define FNAME(x) my_ ## x +#define STATIC static +#else +#define FNAME(x) x +#define STATIC +#endif + +STATIC float FNAME(sdsdot)( const int* N, const float* alpha, const float* X, const int* incX, const float* Y, const int* incY ) +{ + DEBUG_S( "sdsdot" ) + return cblas_sdsdot( *N, *alpha, X, *incX, Y, *incY ); +} + +STATIC float FNAME(sdot)( const int* N, const float* X, const int* incX, const float* Y, const int* incY ) +{ + DEBUG_S( "sdot" ) + return cblas_sdot( *N, X, *incX, Y, *incY ); +} + +STATIC float FNAME(snrm2)( const int* N, const float* X, const int* incX ) +{ + DEBUG_S( "snrm2" ) + return cblas_snrm2( *N, X, *incX ); +} + +STATIC float FNAME(sasum)( const int* N, const float *X, const int* incX ) +{ + DEBUG_S( "sasum" ) + return cblas_sasum( *N, X, *incX ); +} + +STATIC c_float FNAME(cdotu)( const int* N, const void* X, const int* incX, const void* Y, const int* incY ) +{ + DEBUG_S( "cdotu" ) + c_float ans; + cblas_cdotu_sub( *N, X, *incX, Y, *incY, &ans ); + 
return ans; +} + +STATIC c_float FNAME(cdotc)( const int* N, const void* X, const int* incX, const void* Y, const int* incY ) +{ + DEBUG_S( "cdotc" ) + c_float ans; + cblas_cdotc_sub( *N, X, *incX, Y, *incY, &ans ); + return ans; +} + +STATIC float FNAME(scnrm2)( const int* N, const void* X, const int* incX ) +{ + DEBUG_S( "scnrm2" ) + return cblas_scnrm2( *N, X, *incX ); +} + +STATIC float FNAME(scasum)( const int* N, const void *X, const int* incX ) +{ + DEBUG_S( "scasum" ) + return cblas_scasum( *N, X, *incX ); +} + +STATIC c_double FNAME(zdotu)( const int* N, const void* X, const int* incX, const void* Y, const int* incY ) +{ + DEBUG_S( "zdotu" ) + c_double ans; + cblas_zdotu_sub( *N, X, *incX, Y, *incY, &ans ); + return ans; +} + +STATIC c_double FNAME(zdotc)( const int* N, const void* X, const int* incX, const void* Y, const int* incY ) +{ + DEBUG_S( "zdotc" ) + c_double ans; + cblas_zdotc_sub( *N, X, *incX, Y, *incY, &ans ); + return ans; +} + +#ifdef VECLIBFORT_SGEMV +STATIC void FNAME(sgemv)( const char* trans, const int* m, const int* n, + const float* alpha, const float* A, const int* ldA, + const float* X, const int* incX, + const float* beta, float* Y, const int* incY ) +{ + DEBUG_S( "sgemv" ) + enum CBLAS_TRANSPOSE T; + switch ( trans[0] ) { + case 'T': case 't': + if ( ((intptr_t)X|(intptr_t)A|(intptr_t)Y)%32 == 0 ) { T = CblasTrans; break; } + /* Implement as alpha * X^T * A + beta * Y^T */ + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 1, *n, *m, *alpha, X, *incX, A, *ldA, *beta, Y, *incY ); + return; + default: + T = CblasNoTrans; + } + cblas_sgemv( CblasColMajor, T, *m, *n, *alpha, A, *ldA, X, *incX, *beta, Y, *incY ); +} +#endif + +#undef FNAME +#undef STATIC + diff --git a/tools/vecLibFort/tester.f90 b/tools/vecLibFort/tester.f90 new file mode 100644 index 00000000000..adf2ca7fd7a --- /dev/null +++ b/tools/vecLibFort/tester.f90 @@ -0,0 +1,39 @@ +real, dimension(2,6) :: a +complex, dimension(2,4) :: b +double complex, dimension(2,3) 
:: c + +real sdot, sdsdot, snrm2, sasum, scnrm2, scasum, slamch +real slange, clange, slansy, clansy +complex cdotu, cdotc +double complex zdotu, zdotc + +a = transpose(reshape([1,3,2,4,3,5, 6,4,5,3,4,2],[6,2])) +b = transpose(reshape([(1,2),(3,4),(5,6),(7,8), (8,1),(7,2),(6,3),(5,4)],[4,2])) +c = transpose(reshape([(3,2),(2,4),(1,6), (4,6),(5,4),(6,2)],[3,2])) + +write(*,*) 'If the return value interface is fixed, none of these values will' +write(*,*) 'be zero, nor will they be nonsensically large or small. On the' +write(*,*) 'other hand, if the translation is incorrect, it is more likely' +write(*,*) 'that this program will carsh.' +write(*,*) ' ' + +write(*,*) sdot(6,a(1,:),1,a(2,:),1), sdsdot(6,2.0,a(1,:),1,a(2,:),1), & + snrm2(6,a(1,:),1), sasum(6,a(2,:),1) +write(*,*) cdotu(4,b(1,:),1,b(2,:),1), cdotc(4,b(1,:),1,b(2,:),1) +write(*,*) scnrm2(4,b(1,:),1), scasum(4,b(2,:),1) +write(*,*) zdotu(3,c(1,:),1,c(2,:),1) +write(*,*) zdotc(3,c(1,:),1,c(2,:),1) + +write(*,*) slange('F',2,6,a,2,a),clange('F',2,4,b,2,b), & + slansy('F','L',2,a,2,a),clansy('F','L',2,a,2,a) + +write(*,*) ' ' +write(*,*) 'These are the machine constants generated by SLAMCH. We expect' +write(*,*) 'some of them to be small (E-08,E-38).' 
+write(*,*) ' ' + +write(*,*) slamch('E'),slamch('S'),slamch('B') +write(*,*) slamch('P'),slamch('R'),slamch('M') +write(*,*) slamch('U'),slamch('L'),slamch('O') + +end diff --git a/tools/vecLibFort/vecLib-760.100.h b/tools/vecLibFort/vecLib-760.100.h new file mode 100644 index 00000000000..cb0596444f4 --- /dev/null +++ b/tools/vecLibFort/vecLib-760.100.h @@ -0,0 +1,68 @@ +/* + * Modeled from Apple's vecLib-760.10 instance of vecLib.h: + * /Library/Developer/CommandLineTools/SDKs/MacOSX11.3.sdk/System/Library/Frameworks/Accelerate.framework/Frameworks/vecLib.framework/Headers/vecLib.h + */ + +#ifndef __VECLIB__ +#define __VECLIB__ + +#ifndef __VECLIBTYPES__ +#include +#endif + +#ifndef __VBASICOPS__ +#include +#endif + +#ifndef __VBIGNUM__ +#include +#endif + +#ifndef __VECTOROPS__ +#include +#endif + +#ifndef __VFP__ +#include +#endif + +#ifndef __VDSP__ +#include +#endif + +#if defined __ppc__ || defined __i386__ +#ifndef __VDSP_TRANSLATE__ +#include +#endif +#endif + +#ifndef CBLAS_H +#include +#endif + +#ifndef __CLAPACK_H +#include +#endif + +#ifndef __LINEAR_ALGEBRA_PUBLIC_HEADER__ +#include +#endif + +#ifndef __SPARSE_HEADER__ +#include +#include +#endif + +#ifndef __QUADRATURE_PUBLIC_HEADER__ +#include +#endif // __QUADRATURE_PUBLIC_HEADER__ + +#ifndef __BNNS_HEADER__ +#include +#endif // __BNNS_HEADER__ + +#ifndef __VFORCE_H +#include +#endif + +#endif /* __VECLIB__ */ diff --git a/tools/vecLibFort/vecLibFort.c b/tools/vecLibFort/vecLibFort.c new file mode 100644 index 00000000000..f5ec7c30011 --- /dev/null +++ b/tools/vecLibFort/vecLibFort.c @@ -0,0 +1,301 @@ +/* + + vecLibFort + https://github.com/mcg1969/vecLibFort + Run-time F2C/GFORTRAN translation for Apple's vecLib BLAS/LAPACK + Copyright (c) 2014 Michael C. Grant + + See README.md for full background and usage details. + + Use, modification and distribution is subject to the Boost Software + License, Version 1.0. 
See the accompanying file LICENSE or + + http://www.boost.org/LICENSE_1_0.txt + +*/ + +#include +#include "cloak.h" +/* Don't load the CLAPACK header, because we are using a different calling + convention for the replaced functions than the ones listed there. */ +#define __CLAPACK_H +#include "vecLib-760.100.h" +#include +#include + +/* Add a SGEMV fix for Mavericks. See + http://www.openradar.me/radar?id=5864367807528960 */ + +#if !defined(VECLIBFORT_SGEMV) && \ + defined(MAC_OS_X_VERSION_10_9) && \ + MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_9 && \ + !(defined(MAC_OS_X_VERSION_10_10) && \ + MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10) +#define VECLIBFORT_SGEMV +#endif + +#define VOIDS_(s,i,id) COMMA_IF(i) void* +#define VOIDS(n) IF(n)(EXPR_S(0)(REPEAT_S(0,DEC(n),VOIDS_,~)),void) +#define VOIDA_(s,i,id) COMMA_IF(i) void *a ## i +#define VOIDA(n) IF(n)(EXPR_S(0)(REPEAT_S(0,DEC(n),VOIDA_,~)),void) +#define PARAM_(s,i,id) COMMA_IF(i)a ## i +#define PARAM(n) IF(n)(EXPR_S(0)(REPEAT_S(0,DEC(n),PARAM_,~)),) + +#ifdef VECLIBFORT_VERBOSE +#define DEBUG(...) fprintf(stderr,__VA_ARGS__); +static const char* dynamic_msg = "Entering dynamic %s replacement\n"; +static const char* static_msg = "Entering static %s replacement\n"; +#define DEBUG_S(x) DEBUG( static_msg, x ) +#define DEBUG_D(x) DEBUG( dynamic_msg, x ) + +#else +#define DEBUG(...) +#define DEBUG_S(x) +#define DEBUG_D(x) +#endif + +#include +typedef float complex c_float; +typedef double complex c_double; + +#ifdef VECLIBFORT_INTERPOSE + +/* + * INTERPOSING MODE + * + * In this mode, dyld is instructed to preload this library even before the + * executable itself. It reads the __DATA.__interpose section of the library + * for the interpose information, which it uses to swap out the offending + * BLAS/LAPACK functions with our replacements.
Because vecLib provides two + * aliases for each function---one with a trailing underscore, and one + * without---we need two interpose records for each replacement. + * + * For instance, for "sdot", we define a static function + * static float my_sdot( const int* N, const float* X, const int* incX ) + * add interpose data to signify two substitutions: + * sdot_ -> my_sdot + * sdot -> my_sdot + */ + +typedef struct interpose_t_ { + const void *replacement; /* address of our my_<name> wrapper (first initializer in INTERPOSE) */ + const void *original; /* address of the <name> or <name>_ symbol being replaced */ +} interpose_t; + +#define INTERPOSE(name) \ +__attribute__((used)) interpose_t interpose_ ## name [] \ +__attribute__((section ("__DATA,__interpose"))) = \ +{ { (const void*)&my_ ## name, (const void*)&name }, \ + { (const void*)&my_ ## name, (const void*)&name ## _ } }; + +#define D2F_CALL(name,n) \ +extern double name( VOIDS(n) ); \ +extern double name ## _( VOIDS(n) ); \ +static float my_ ## name ( VOIDA(n) ) \ +{ return (float)name ## _( PARAM(n) ); } \ +INTERPOSE(name) + +#define CPLX_CALL(type,name,n) \ +extern void name( VOIDS(INC(n)) ); \ +extern void name ## _( VOIDS(INC(n)) ); \ +static c_ ## type my_ ## name ( VOIDA(n) ) \ +{ \ + c_ ## type cplx; \ + name ## _( &cplx, PARAM(n) ); \ + return cplx; \ +} \ +INTERPOSE(name) + +/* + * DYNAMIC BLAS SUBSTITUTION + * + * For the interpose library we need to use the same techniques for the BLAS + * as we do for the LAPACK routines. However, because we have CBLAS versions + * available to use, we can use the wrappers already created in "static.h" + * by prepending them with the "my_" prefixes.
+ */ + +#define BLS_CALL(type,name,n) \ +extern type name( VOIDS(n) ); \ +extern type name ## _( VOIDS(n) ); \ +INTERPOSE(name) + +#define ADD_PREFIX +#include "static.h" +#undef ADD_PREFIX + +BLS_CALL(float,sdsdot,6) +BLS_CALL(float,sdot,5) +BLS_CALL(float,snrm2,3) +BLS_CALL(float,sasum,3) +BLS_CALL(c_float,cdotu,5) +BLS_CALL(c_float,cdotc,5) +BLS_CALL(float,scnrm2,3) +BLS_CALL(float,scasum,3) +BLS_CALL(c_double,zdotu,5) +BLS_CALL(c_double,zdotc,5) +#if defined(VECLIBFORT_SGEMV) +BLS_CALL(void,sgemv,11) +#endif + +#else /* !VECLIBFORT_INTERPOSE */ + +/* + * STATIC BLAS SUBSTITUTION + * + * For BLAS functions, we have access to CBLAS versions of each function. + * So the hoops we need to jump through to resolve the name clashes in the + * dynamic substitution mode can be avoided. We simply create the + * replacement functions to call the CBLAS counterparts instead. + * + * To avoid duplicating code, we include the functions in "static.h" twice: + * once for the functions with trailing underscores (e.g., "sdot_"), and once + * without (e.g., "sdot"). In theory, we could create just one replacement + * with two aliases, but clang has thus far been uncooperative. Any assistance + * on this matter would be appreciated. + */ + +#include "static.h" +#define ADD_UNDERSCORE +#include "static.h" + +/* + * DYNAMIC LAPACK SUBSTITUTION + * + * In this mode, we give our functions identical names, and rely on link + * order to ensure that these take precedence over those declared in vecLib. + * Thus whenever the main code attempts to call one of the covered functions, + * it will be directed to one of our wrappers instead. + * + * Because vecLib provides two aliases for each function---one with a + * trailing underscore, and one without---we actually need two separate + * replacement functions (at least until we can figure out how to do aliases + * cleanly in clang.) Each pair of replacements controls a single static + * pointer to the replacement function.
On the first invocation of either, + * this pointer is retrieved using a dlsym() command. + * + * For instance, for "sdot", we define two functions + * float sdot_( const int* N, const float* X, const int* incX ) + * float sdot ( const int* N, const float* X, const int* incX ) + * On the first invocation of either, the "sdot_" symbol from vecLib is + * retrieved using the dlsym() command and stored in + * static void* fp_sdot; + * In theory, we could create just one replacement with two aliases, but + * clang has thus far been uncooperative. Any assistance on this matter would + * be appreciated. + */ + +#include +#include +#include + +#define VECLIB_FILE "/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/vecLib" + +static void * veclib = 0; /* dlopen() handle for vecLib; stays 0 until loadlib() succeeds */ + +static void unloadlib(void) /* atexit() hook registered by loadlib(): closes the vecLib handle */ +{ + DEBUG( "Unloading vecLib\n" ); + dlclose (veclib); +} + +static void loadlib(void) /* opens VECLIB_FILE into veclib, or aborts the process on failure */ +{ + static const char* veclib_loc = VECLIB_FILE; + DEBUG( "Loading library: %s\n", veclib_loc ) + veclib = dlopen (veclib_loc, RTLD_LOCAL | RTLD_FIRST); + if ( veclib == 0 ) { + fprintf( stderr, "Failed to open vecLib library; aborting.\n Location: %s\n", veclib_loc ); /* print the path: the handle is null on this branch and is not a string */ + abort (); + } + atexit(unloadlib); +} + +static void* loadsym( const char* nm ) /* returns the address of symbol nm in vecLib (loading the library on first use), or aborts */ +{ + if ( veclib == 0 ) loadlib(); + DEBUG( "Loading function: %s\n", nm ) + void *ans = dlsym( veclib, nm ); + if ( ans != 0 ) return ans; + fprintf( stderr, "vecLib symbol '%s' could not be resolved; aborting.\n", nm ); + abort(); +} + +#define D2F_CALL_(fname,name,n) \ +float fname( VOIDA(n) ) \ +{ \ + DEBUG_D( #name "_" ) \ + if ( !fp_ ## name ) fp_ ## name = loadsym( #name "_" ); \ + return ((ft_ ## name)fp_ ## name)( PARAM(n) ); \ +} + +#define D2F_CALL(name,n) \ +typedef double (*ft_ ## name)( VOIDS(n) ); \ +static void *fp_ ## name = 0; \ +D2F_CALL_(name,name,n) \ +D2F_CALL_(name ## _,name,n) + +#define CPLX_CALL_(type,fname,name,n) \ +c_ ## type fname( VOIDA(n) ) \ +{ \ + c_ ## type cplx; \ + DEBUG_D( #name "_" ) \ + if (
!fp_ ## name ) fp_ ## name = loadsym( #name "_" ); \ + ((ft_ ## name)fp_ ## name)( &cplx, PARAM(n) ); \ + return cplx; \ +} + +#define CPLX_CALL(type,name,n) \ +typedef void (*ft_ ## name)( VOIDS(INC(n)) ); \ +static void *fp_ ## name = 0; \ +CPLX_CALL_(type,name,name,n) \ +CPLX_CALL_(type,name ## _,name,n) + +#endif /* VECLIBFORT_INTERPOSE */ + +D2F_CALL(clangb,7) /* D2F_CALL: float-returning wrapper around a vecLib routine declared as returning double */ +D2F_CALL(clange,6) +D2F_CALL(clangt,5) +D2F_CALL(clanhb,7) +D2F_CALL(clanhe,6) +D2F_CALL(clanhp,5) +D2F_CALL(clanhs,5) +D2F_CALL(clanht,4) +D2F_CALL(clansb,7) +D2F_CALL(clansp,5) +D2F_CALL(clansy,6) +D2F_CALL(clantb,8) +D2F_CALL(clantp,6) +D2F_CALL(clantr,8) + +D2F_CALL(scsum1,3) +#if defined(MAC_OS_X_VERSION_10_6) && \ + MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 +D2F_CALL(slaneg,6) +#endif /* MAC_OS_X_VERSION_MIN_REQUIRED >= 10.6 */ +D2F_CALL(slangb,7) +D2F_CALL(slange,6) +D2F_CALL(slangt,5) +D2F_CALL(slanhs,5) +D2F_CALL(slansb,7) +D2F_CALL(slansp,5) +D2F_CALL(slanst,4) +D2F_CALL(slansy,6) +D2F_CALL(slantb,8) +D2F_CALL(slantp,6) +D2F_CALL(slantr,8) +D2F_CALL(slapy2,2) +D2F_CALL(slapy3,3) +D2F_CALL(slamch,1) +D2F_CALL(slamc3,2) + +#if defined(MAC_OS_X_VERSION_10_7) && \ + MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 +D2F_CALL(clanhf,6) +D2F_CALL(slansf,6) +#endif /* MAC_OS_X_VERSION_MIN_REQUIRED >= 10.7 */ + +CPLX_CALL(float,cladiv,2) /* CPLX_CALL: vecLib routine writes the complex result through a leading pointer argument; the wrapper returns it by value */ +CPLX_CALL(double,zladiv,2) + +