# Configure the HerculesCompiler llvm-passes project with CMake, using the
# Ninja generator, so that build_passes/build.ninja is produced.
#
# Both prerequisites are order-only (they follow `|`): the build_passes
# directory and the LLVM CMake package directory under $(TMP_DESTDIR), so their
# timestamps never force a re-configure.
#
# The recipe changes into $(@D) (the directory part of the target, i.e.
# build_passes) before invoking cmake, and hands cmake the same LLVM CMake
# package directory through LLVM_DIR.
#
# NOTE(review): lines prefixed with `-`/`+` look like unified-diff markers from
# a packaging change that splits the cmake arguments and adds cache, prefetch
# and inlining defaults -- confirm before treating this text as a plain
# makefile.
# NOTE(review): ULES_EXTERNAL_LINKAGE may be a truncated option name -- verify
# against the options declared by the llvm-passes CMakeLists.
build_passes/build.ninja: | build_passes $(TMP_DESTDIR)/$(PREFIX)/lib/cmake/llvm
cd $(@D) && cmake -DCMAKE_INSTALL_PREFIX="$(PREFIX)" \
- -DLLVM_DIR':'STRING=$(TMP_DESTDIR)$(PREFIX)/lib/cmake/llvm -DCMAKE_BUILD_TYPE=DEBUG \
+ -DLLVM_DIR':'STRING=$(TMP_DESTDIR)$(PREFIX)/lib/cmake/llvm \
+ -DCMAKE_BUILD_TYPE=DEBUG \
$(LLVM_CROSS_FLAGS) -DLLVM_TARGETS_TO_BUILD="X86;ARM" \
+ -DCPU_CACHE_SIZE_DEFAULT=524288 \
+ -DGPU_CACHE_SIZE_DEFAULT=524288 \
+ -DGPU_SCRATCHPAD_SIZE_DEFAULT=48000 \
+ -DUSE_HW_CACHES=ON \
+ -DHIERARCHICAL_INTERVALS=ON \
+ -DPREFETCH_REPS=1 \
+ -DUSE_HW_CACHES_PREFETCH=ON \
+ -DUSE_HW_CACHES_INLINEPTX_PREFETCH=OFF \
+ -DUSE_HW_CACHES_LIBCALL=OFF \
+ -DUSE_HW_CACHES_VOLALOAD=OFF \
+ -DUSE_HW_CACHES_SINGLEWRITEBACK=OFF \
+ -DUSE_HW_CACHES_INDWRITEBACK_LIBCALL=ON \
+ -DUSE_HW_CACHES_INDWRITEBACK_INLINE=OFF \
+ -DULES_EXTERNAL_LINKAGE=OFF \
+ -DALWAYS_INLINE_UNSPECIALIZED=OFF \
+ -DALWAYS_INLINE_LOAD=OFF \
+ -DALWAYS_INLINE_EXECUTE=OFF \
+ -DALWAYS_INLINE_STORE=OFF \
+ -DAGGRESSIVELY_INLINE_CALL_TREE=OFF \
-G "Ninja" $(CURDIR)/HerculesCompiler/llvm-passes
passes: build_passes/build.ninja
--- /dev/null
+Description: Set the default CUDA compute capability to 6.2 (sm_62) for the TX2, according to D33CompilerRuntime.docx
+
+--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
++++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+@@ -66,7 +66,7 @@
+ set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm})
+ endforeach()
+ else()
+- set(CUDA_ARCH -arch sm_35)
++ set(CUDA_ARCH -arch sm_62)
+ endif()
+
+ # Activate RTL message dumps if requested by the user.
+@@ -176,7 +176,7 @@
+ set(CUDA_ARCH ${CUDA_ARCH} --cuda-gpu-arch=sm_${sm})
+ endforeach()
+ else()
+- set(CUDA_ARCH --cuda-gpu-arch=sm_35)
++ set(CUDA_ARCH --cuda-gpu-arch=sm_62)
+ endif()
+
+ # Compile cuda files to bitcode.
+--- a/clang/lib/Driver/ToolChains.cpp
++++ b/clang/lib/Driver/ToolChains.cpp
+@@ -4967,7 +4967,7 @@
+ // macro for it. Also, select the default PTX version to be used. We use 4.2 for
+ // compute capabilities older than 6.0 and 5.0 otherwise.
+ #ifndef OPENMP_NVPTX_COMPUTE_CAPABILITY
+-#define OPENMP_NVPTX_COMPUTE_CAPABILITY 53
++#define OPENMP_NVPTX_COMPUTE_CAPABILITY 62
+ #endif
+
+ #if OPENMP_NVPTX_COMPUTE_CAPABILITY < 60
Guard-architecture-dependent-code-with-#ifdefs.patch
require-cuda.patch
configure-herculescompiler-via-cmake.patch
+use_gpuguard.patch
+cuda_arch.patch
--- /dev/null
+--- a/HerculesCompiler/libpremnotify/libpremnotify-gpu.cu
++++ b/HerculesCompiler/libpremnotify/libpremnotify-gpu.cu
+@@ -47,7 +47,7 @@
+ // SYNC: We can use GPUguard, synchronization, or (if none of the below macros
+ // are defined), no sync at all.
+ // Use GPUguard for synchronization
+-#define USE_GPUGUARD
++// #define USE_GPUGUARD
+
+ // Use barriers for synchronization
+ #define USE_BARRIERS
+--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
++++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+@@ -19,7 +19,7 @@
+ // kernel, and the LKM will be invoked for synchronization. If it is not
+ // defined, the offloading will be done as in the original libomptarget CUDA
+ // rtl.
+-#define USE_GPUGUARD
++// #define USE_GPUGUARD
+
+ // After each kernel, finalize the results to get output in deadline misses and
+ // enable the GPUguard LKM to output more statistics to the kernel log (if