1 --- a/HerculesCompiler/llvm-passes/CMakeLists.txt
2 +++ b/HerculesCompiler/llvm-passes/CMakeLists.txt
4 MESSAGE( STATUS "PROJECTS_MAIN_SRC_DIR = " ${PROJECTS_MAIN_SRC_DIR} )
5 MESSAGE( STATUS "PROJECTS_MAIN_INCLUDE_DIR = " ${PROJECTS_MAIN_INCLUDE_DIR} )
7 -include_directories(${PROJECTS_MAIN_INCLUDE_DIR})
8 +include_directories(${PROJECTS_MAIN_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/include)
10 +include(CMakeDependentOption)
12 +# // ----------------------------
13 +# // ---- CORE CONFIGURATION ----
14 +# // ----------------------------
16 +# // Default capacities, in bytes, of the SPM or cache. Reference values:
17 +# //   SPM (TX1, TX2) = 48000
18 +# //   CACHE TX1 CPU  = 524288
19 +# //   CACHE TX1 GPU  = 262144
20 +# //   CACHE TX2 GPU  = 524288
21 +# //   CACHE TX2 CPU  = 524288
22 +set(CPU_CACHE_SIZE_DEFAULT "524288" CACHE STRING "CPU cache size")
23 +set(GPU_CACHE_SIZE_DEFAULT "524288" CACHE STRING "GPU cache size")
24 +set(GPU_SCRATCHPAD_SIZE_DEFAULT "48000" CACHE STRING "GPU scratchpad size")
26 +# // Defaults mirror the hand-written Options.h this replaces: USE_HW_CACHES was commented out, HIERARCHICAL_INTERVALS was defined.
27 +option(USE_HW_CACHES "Use hardware cache prefetches in Specialized function, in place of SPM-based" OFF)
28 +option(HIERARCHICAL_INTERVALS "Use hierarchical PREM intervals for loops" ON)
30 +# // -----------------------------
31 +# // ---- CACHE CONFIGURATION ----
32 +# // -----------------------------
33 +# // How data is brought local when hardware caches are in use: prefetch
34 +# // instructions, inline-PTX prefetch, a library call, or load instructions.
35 +# // While USE_HW_CACHES is OFF, each option below is fixed to its last argument.
36 +set(PREFETCH_REPS "1" CACHE STRING "")
37 +cmake_dependent_option(USE_HW_CACHES_PREFETCH "Use prefetch instructions" ON "USE_HW_CACHES" ON)
39 +cmake_dependent_option(USE_HW_CACHES_INLINEPTX_PREFETCH "Use INLINEPTX prefetch" OFF "USE_HW_CACHES" OFF)
41 +cmake_dependent_option(USE_HW_CACHES_LIBCALL "Prefetch via library call" OFF "USE_HW_CACHES" OFF)
43 +cmake_dependent_option(USE_HW_CACHES_VOLALOAD "Prefetch via load instructions" OFF "USE_HW_CACHES" OFF)
45 +# // Writeback policy when using caches; SINGLEWRITEBACK selects a single writeback function.
46 +# // While USE_HW_CACHES is OFF, each option below is fixed to its last argument.
47 +cmake_dependent_option(USE_HW_CACHES_SINGLEWRITEBACK "" OFF "USE_HW_CACHES" OFF)
49 +cmake_dependent_option(USE_HW_CACHES_INDWRITEBACK_LIBCALL "" ON "USE_HW_CACHES" ON)
51 +cmake_dependent_option(USE_HW_CACHES_INDWRITEBACK_INLINE "" OFF "USE_HW_CACHES" OFF)
54 +# // ---------------------------------------
55 +# // ---- CODE GENERATION CONFIGURATION ----
56 +# // ---------------------------------------
58 +# // Allow exporting of PREMized functions through external linkage (useful for
59 +# // compiling PREMized library functions). Off by default, as in the former Options.h.
60 +option(ULES_EXTERNAL_LINKAGE "Allow exporting of PREMized functions through external linkage (useful for compiling PREMized library functions)" OFF)
62 +# // Inlining of specialized functions. Every switch below defaults to ON and
63 +# // is emitted into Config/Options.h as a macro of the same name.
64 +option(ALWAYS_INLINE_UNSPECIALIZED "" ON)
65 +option(ALWAYS_INLINE_LOAD "" ON)
66 +option(ALWAYS_INLINE_EXECUTE "" ON)
67 +option(ALWAYS_INLINE_STORE "" ON)
69 +option(AGGRESSIVELY_INLINE_CALL_TREE "Inlining of call trees in Channel Arg Insertion" ON)
71 +# // -------------------------------------------------
72 +# // ---- ADVANCED OPTIONS ---- DEFAULTS ARE SANE ----
73 +# // -------------------------------------------------
75 +option(USE_PREM_INIT_FINI "Synchronization" ON)
76 +# // NOTE(review): the help text below describes enabling DMA-like loading, while the option name says NEVER - confirm which is intended.
77 +option(NEVER_USE_SOFTDMA "Use the new DMA-like loading of data for Load and/or Store phases" ON)
78 +# // Schedule selection: Options.h.in raises #error unless exactly one of the
79 +# // three SCHEDULE_* options is ON.
80 +option(SCHEDULE_COMPATIBLE "" OFF)
81 +option(SCHEDULE_LES "" ON)
82 +option(SCHEDULE_COMBINED "" OFF)
84 +set(NUM_THREADS_LES_IN_COMBINED 512 CACHE STRING "COMBINED schedule only: Number of threads to use for Specialized")
86 +set(CACHE_LINE_SIZE 128 CACHE STRING "The size in bytes of a cache line (128 bytes on TX1)")
88 +option(NEVER_INLINE_SYNC "Inlining of synchronization functions" OFF)
89 +# // Generate Config/Options.h in the build tree from the template in the source tree.
90 +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/include/Config/Options.h.in" "${CMAKE_CURRENT_BINARY_DIR}/include/Config/Options.h")
93 --- a/HerculesCompiler/llvm-passes/include/Config/Options.h
97 -// ----------------------------
98 -// ---- CORE CONFIGURATION ----
99 -// ----------------------------
101 -// Amount of bytes available in the SPM or cache
102 -// SPM (TX1, TX2) = 48000
103 -// CACHE TX1 CPU = 524288
104 -// CACHE TX1 GPU = 262144
105 -// CACHE TX2 GPU = 524288
106 -// CACHE TX2 CPU = 524288
107 -#define CPU_CACHE_SIZE_DEFAULT 524288
108 -#define GPU_CACHE_SIZE_DEFAULT 524288
109 -#define GPU_SCRATCHPAD_SIZE_DEFAULT 48000
111 -// Use hardware cache prefetches in Specialized function, in place of SPM-based.
112 -//#define USE_HW_CACHES
114 -// Use hierarchical PREM intervals for loops.
115 -#define HIERARCHICAL_INTERVALS
117 -// -----------------------------
118 -// ---- CACHE CONFIGURATION ----
119 -// -----------------------------
121 -// If we use hardware caches, we can either use load or prefetch instructions
122 -// to bring the data local.
123 -#define PREFETCH_REPS 1
124 -#define USE_HW_CACHES_PREFETCH
125 -//#define USE_HW_CACHES_INLINEPTX_PREFETCH
126 -//#define USE_HW_CACHES_LIBCALL
127 -//#define USE_HW_CACHES_VOLALOAD
129 -// Use a single writeback function when using caches
130 -//#define USE_HW_CACHES_SINGLEWRITEBACK
131 -#define USE_HW_CACHES_INDWRITEBACK_LIBCALL
132 -//#define USE_HW_CACHES_INDWRITEBACK_INLINE
134 -// ---------------------------------------
135 -// ---- CODE GENERATION CONFIGURATION ----
136 -// ---------------------------------------
138 -// Allow exporting of PREMized functions through external linkage (useful for
139 -// compiling PREMized library functions).
140 -//#define ULES_EXTERNAL_LINKAGE
142 -// Inlining of specialized functions
143 -#define ALWAYS_INLINE_UNSPECIALIZED
144 -#define ALWAYS_INLINE_LOAD
145 -#define ALWAYS_INLINE_EXECUTE
146 -#define ALWAYS_INLINE_STORE
148 -// Inlining of call trees in Channel Arg Insertion
149 -#define AGGRESSIVELY_INLINE_CALL_TREE
151 -// -------------------------------------------------
152 -// ---- ADVANCED OPTIONS ---- DEFAULTS ARE SANE ----
153 -// -------------------------------------------------
156 -#define USE_PREM_INIT_FINI
158 -// Use the new DMA-like loading of data for Load and/or Store phases
159 -#define NEVER_USE_SOFTDMA
162 -//#define SCHEDULE_COMPATIBLE
163 -#define SCHEDULE_LES
164 -//#define SCHEDULE_COMBINED
166 -// COMBINED schedule only: Number of threads to use for Specialized
167 -#define NUM_THREADS_LES_IN_COMBINED 512
169 -// The size in bytes of a cache line (128 bytes on TX1)
170 -#define CACHE_LINE_SIZE 128
172 -// Inlining of synchronization functions
173 -//#define NEVER_INLINE_SYNC
175 -// -------------------------------------------------------------------------- //
176 -// ---- DO NOT CHANGE BELOW THIS LINE ---- AUTOGENERATION BASED ON ABOVE ---- //
177 -// -------------------------------------------------------------------------- //
179 -// Check that schedule is sane.
180 -#if defined(SCHEDULE_COMPATIBLE) && \
181 - (defined(SCHEDULE_LES) || defined(SCHEDULE_COMBINED))
182 -#error Multiple schedules defined!
184 -#if defined(SCHEDULE_LES) && \
185 - (defined(SCHEDULE_COMPATIBLE) || defined(SCHEDULE_COMBINED))
186 -#error Multiple schedules defined!
188 -#if defined(SCHEDULE_COMBINED) && \
189 - (defined(SCHEDULE_LES) || defined(SCHEDULE_COMPATIBLE))
190 -#error Multiple schedules defined!
192 -#if !defined(SCHEDULE_COMPATIBLE) && !defined(SCHEDULE_LES) && \
193 - !defined(SCHEDULE_COMBINED)
194 -#error No schedule defined!
197 -// Check that the HW CACHE config is sane.
198 -#ifdef USE_HW_CACHES
199 -#define DONT_SPECIALIZE_EXECUTE
200 -//# if defined(USE_HW_CACHES_PREFETCH) && defined(USE_HW_CACHES_VOLALOAD)
201 -//# error Using both volatile loads and prefetches for HW CACHE mode.
203 -#if !defined(USE_HW_CACHES_PREFETCH) && !defined(USE_HW_CACHES_VOLALOAD) && \
204 - !defined(USE_HW_CACHES_INLINEPTX_PREFETCH) && \
205 - !defined(USE_HW_CACHES_LIBCALL)
206 -#error No policy for HW caches defined!
208 -#if defined(USE_HW_CACHES_INLINEPTX_PREFETCH) && \
209 - (defined(USE_HW_CACHES_PREFETCH) || defined(USE_HW_CACHES_VOLALOAD))
210 -#error Multiple cache policies defined!
212 -#if defined(USE_HW_CACHES_PREFETCH) && \
213 - (defined(USE_HW_CACHES_INLINEPTX_PREFETCH) || \
214 - defined(USE_HW_CACHES_VOLALOAD))
215 -#error Multiple cache policies defined!
217 -#if defined(USE_HW_CACHES_VOLALOAD) && \
218 - (defined(USE_HW_CACHES_PREFETCH) || \
219 - defined(USE_HW_CACHES_INLINEPTX_PREFETCH))
220 -#error Multiple cache policies defined!
223 -#if defined(USE_HW_CACHES_SINGLEWRITEBACK) && \
224 - defined(USE_HW_CACHES_INDWRITEBACK_LIBCALL) && \
225 - defined(USE_HW_CACHES_INDWRITEBACK_INLINE)
226 -#error Multiple cache writeback policies!
227 -#elif defined(USE_HW_CACHES_INDWRITEBACK_LIBCALL) && \
228 - defined(USE_HW_CACHES_INDWRITEBACK_INLINE)
229 -#error Multiple cache writeback policies!
230 -#elif defined(USE_HW_CACHES_SINGLEWRITEBACK) && \
231 - defined(USE_HW_CACHES_INDWRITEBACK_INLINE)
232 -#error Multiple cache writeback policies!
233 -#elif defined(USE_HW_CACHES_SINGLEWRITEBACK) && \
234 - defined(USE_HW_CACHES_INDWRITEBACK_LIBCALL)
235 -#error Multiple cache writeback policies!
240 -#ifdef EMPTY_COMPUTE_PHASE
241 -#warning Empty compute phase!
244 +++ b/HerculesCompiler/llvm-passes/include/Config/Options.h.in
247 +// ----------------------------
248 +// ---- CORE CONFIGURATION ----
249 +// ----------------------------
250 +// Sizes come from the CMake cache. Plain #define keeps each macro defined even if its value is 0.
251 +// Amount of bytes available in the SPM or cache
252 +// SPM (TX1, TX2) = 48000
253 +// CACHE TX1 CPU = 524288
254 +// CACHE TX1 GPU = 262144
255 +// CACHE TX2 GPU = 524288
256 +// CACHE TX2 CPU = 524288
257 +#define CPU_CACHE_SIZE_DEFAULT @CPU_CACHE_SIZE_DEFAULT@
258 +#define GPU_CACHE_SIZE_DEFAULT @GPU_CACHE_SIZE_DEFAULT@
259 +#define GPU_SCRATCHPAD_SIZE_DEFAULT @GPU_SCRATCHPAD_SIZE_DEFAULT@
261 +// Use hardware cache prefetches in Specialized function, in place of SPM-based.
262 +#cmakedefine USE_HW_CACHES
264 +// Use hierarchical PREM intervals for loops.
265 +#cmakedefine HIERARCHICAL_INTERVALS
267 +// -----------------------------
268 +// ---- CACHE CONFIGURATION ----
269 +// -----------------------------
270 +// PREFETCH_REPS uses a plain #define so it stays defined even when set to 0.
271 +// If we use hardware caches, we can either use load or prefetch instructions
272 +// to bring the data local.
273 +#define PREFETCH_REPS @PREFETCH_REPS@
274 +#cmakedefine USE_HW_CACHES_PREFETCH
275 +#cmakedefine USE_HW_CACHES_INLINEPTX_PREFETCH
276 +#cmakedefine USE_HW_CACHES_LIBCALL
277 +#cmakedefine USE_HW_CACHES_VOLALOAD
279 +// Use a single writeback function when using caches
280 +#cmakedefine USE_HW_CACHES_SINGLEWRITEBACK
281 +#cmakedefine USE_HW_CACHES_INDWRITEBACK_LIBCALL
282 +#cmakedefine USE_HW_CACHES_INDWRITEBACK_INLINE
284 +// ---------------------------------------
285 +// ---- CODE GENERATION CONFIGURATION ----
286 +// ---------------------------------------
287 +// Each #cmakedefine below becomes a #define when the CMake option of the same name is ON.
288 +// Allow exporting of PREMized functions through external linkage (useful for
289 +// compiling PREMized library functions).
290 +#cmakedefine ULES_EXTERNAL_LINKAGE
292 +// Inlining of specialized functions
293 +#cmakedefine ALWAYS_INLINE_UNSPECIALIZED
294 +#cmakedefine ALWAYS_INLINE_LOAD
295 +#cmakedefine ALWAYS_INLINE_EXECUTE
296 +#cmakedefine ALWAYS_INLINE_STORE
298 +// Inlining of call trees in Channel Arg Insertion
299 +#cmakedefine AGGRESSIVELY_INLINE_CALL_TREE
301 +// -------------------------------------------------
302 +// ---- ADVANCED OPTIONS ---- DEFAULTS ARE SANE ----
303 +// -------------------------------------------------
305 +// Synchronization
306 +#cmakedefine USE_PREM_INIT_FINI
308 +// Use the new DMA-like loading of data for Load and/or Store phases
309 +#cmakedefine NEVER_USE_SOFTDMA
311 +// Schedule selection: exactly one of the three must be enabled (checked further down).
312 +#cmakedefine SCHEDULE_COMPATIBLE
313 +#cmakedefine SCHEDULE_LES
314 +#cmakedefine SCHEDULE_COMBINED
316 +// COMBINED schedule only: Number of threads to use for Specialized
317 +#define NUM_THREADS_LES_IN_COMBINED @NUM_THREADS_LES_IN_COMBINED@
319 +// The size in bytes of a cache line (128 bytes on TX1)
320 +#define CACHE_LINE_SIZE @CACHE_LINE_SIZE@
322 +// Inlining of synchronization functions
323 +#cmakedefine NEVER_INLINE_SYNC
325 +// -------------------------------------------------------------------------- //
326 +// ---- DO NOT CHANGE BELOW THIS LINE ---- AUTOGENERATION BASED ON ABOVE ---- //
327 +// -------------------------------------------------------------------------- //
329 +// Check that schedule is sane: exactly one SCHEDULE_* macro must be defined.
330 +#if defined(SCHEDULE_COMPATIBLE) && \
331 + (defined(SCHEDULE_LES) || defined(SCHEDULE_COMBINED))
332 +#error Multiple schedules defined!
333 +#endif
334 +#if defined(SCHEDULE_LES) && \
335 + (defined(SCHEDULE_COMPATIBLE) || defined(SCHEDULE_COMBINED))
336 +#error Multiple schedules defined!
337 +#endif
338 +#if defined(SCHEDULE_COMBINED) && \
339 + (defined(SCHEDULE_LES) || defined(SCHEDULE_COMPATIBLE))
340 +#error Multiple schedules defined!
341 +#endif
342 +#if !defined(SCHEDULE_COMPATIBLE) && !defined(SCHEDULE_LES) && \
343 + !defined(SCHEDULE_COMBINED)
344 +#error No schedule defined!
345 +#endif
347 +// Check that the HW CACHE config is sane. Everything up to the closing
347 +// "#endif // USE_HW_CACHES" only applies when hardware caches are enabled.
348 +#ifdef USE_HW_CACHES
349 +#define DONT_SPECIALIZE_EXECUTE
350 +//# if defined(USE_HW_CACHES_PREFETCH) && defined(USE_HW_CACHES_VOLALOAD)
351 +//# error Using both volatile loads and prefetches for HW CACHE mode.
352 +//# endif
353 +#if !defined(USE_HW_CACHES_PREFETCH) && !defined(USE_HW_CACHES_VOLALOAD) && \
354 + !defined(USE_HW_CACHES_INLINEPTX_PREFETCH) && \
355 + !defined(USE_HW_CACHES_LIBCALL)
356 +#error No policy for HW caches defined!
357 +#endif
358 +#if defined(USE_HW_CACHES_INLINEPTX_PREFETCH) && \
359 + (defined(USE_HW_CACHES_PREFETCH) || defined(USE_HW_CACHES_VOLALOAD))
360 +#error Multiple cache policies defined!
361 +#endif
362 +#if defined(USE_HW_CACHES_PREFETCH) && \
363 + (defined(USE_HW_CACHES_INLINEPTX_PREFETCH) || \
364 + defined(USE_HW_CACHES_VOLALOAD))
365 +#error Multiple cache policies defined!
366 +#endif
367 +#if defined(USE_HW_CACHES_VOLALOAD) && \
368 + (defined(USE_HW_CACHES_PREFETCH) || \
369 + defined(USE_HW_CACHES_INLINEPTX_PREFETCH))
370 +#error Multiple cache policies defined!
371 +#endif
372 +// At most one writeback policy may be selected.
373 +#if defined(USE_HW_CACHES_SINGLEWRITEBACK) && \
374 + defined(USE_HW_CACHES_INDWRITEBACK_LIBCALL) && \
375 + defined(USE_HW_CACHES_INDWRITEBACK_INLINE)
376 +#error Multiple cache writeback policies!
377 +#elif defined(USE_HW_CACHES_INDWRITEBACK_LIBCALL) && \
378 + defined(USE_HW_CACHES_INDWRITEBACK_INLINE)
379 +#error Multiple cache writeback policies!
380 +#elif defined(USE_HW_CACHES_SINGLEWRITEBACK) && \
381 + defined(USE_HW_CACHES_INDWRITEBACK_INLINE)
382 +#error Multiple cache writeback policies!
383 +#elif defined(USE_HW_CACHES_SINGLEWRITEBACK) && \
384 + defined(USE_HW_CACHES_INDWRITEBACK_LIBCALL)
385 +#error Multiple cache writeback policies!
386 +#endif
387 +#endif // USE_HW_CACHES
390 +#ifdef EMPTY_COMPUTE_PHASE
391 +#warning Empty compute phase!