6 #define CMDLINE_BUFFER_SIZE 256
\r
7 CMDLINE_BUFFER(CMDLINE_BUFFER_SIZE);
\r
9 #define POLLUTE_CACHE_SIZE (512 * 1024)
\r
11 #ifdef CONFIG_UART_OXPCIE952
\r
12 #define UART_BASE 0xe010
\r
14 #define UART_BASE 0x3f8
\r
16 #define UART_LSR 0x5
\r
17 #define UART_LSR_THRE 0x20
\r
18 #define UART_IDLE_LOOPS 100
\r
// stdint-style fixed-width type names, mapped onto Jailhouse's u32/u64
\r
21 #define uint32_t u32
\r
22 #define uint64_t u64
\r
24 #define perror(FUNC) (printk(#FUNC))
\r
26 #define printf printk
\r
/*
 * Bare-metal replacement for exit(): print the status via printk and stop
 * the CPU.
 *
 * Wrapped in do { } while (0) so that `exit(x);` is a single statement and
 * stays correct inside an unbraced if/else. SIG is parenthesized so any
 * expression can be passed. The hlt is issued in a loop because a halted
 * CPU may resume execution when it is woken up (e.g. by an interrupt), and
 * exit() must never return to its caller.
 */
#define exit(SIG) do { printk("exit with %d\n", (SIG)); for (;;) asm volatile("hlt"); } while (0)
\r
//-----Time-and-randomization-overrides-----------------------------------
\r
/*
 * Stand-in for time(): returns the current tsc_read() value and, when
 * `seconds` is non-NULL, also stores that value through the pointer.
 *
 * NOTE(review): the value is whatever tsc_read() yields, so despite the
 * parameter name it is presumably not in seconds - confirm against the
 * timing library.
 */
static inline unsigned long time(unsigned long *seconds)
{
	unsigned long now = tsc_read();

	/* time(NULL) is the usual calling convention; never dereference NULL. */
	if (seconds)
		*seconds = now;

	return now;
}
\r
37 * Tables of Maximally-Equidistributed Combined Lfsr Generators (1998)
\r
38 * by Pierre L'Ecuyer
\r
39 * taken from: http://stackoverflow.com/questions/1167253/implementation-of-rand
\r
/*
 * State of the four-component LFSR113 generator. For the generator to work
 * the components must satisfy z1 > 1, z2 > 7, z3 > 15 and z4 > 127.
 */
static unsigned int z1 = 12345, z2 = 12345, z3 = 12345, z4 = 12345;

/*
 * Advance all four state words by one step and return their XOR.
 * The shift/mask constants assume a 32-bit unsigned int.
 */
static unsigned int lfsr113_Bits (void)
{
	unsigned int b;

	b  = ((z1 << 6) ^ z1) >> 13;
	z1 = ((z1 & 4294967294U) << 18) ^ b;
	b  = ((z2 << 2) ^ z2) >> 27;
	z2 = ((z2 & 4294967288U) << 2) ^ b;
	b  = ((z3 << 13) ^ z3) >> 21;
	z3 = ((z3 & 4294967280U) << 7) ^ b;
	b  = ((z4 << 3) ^ z4) >> 12;
	z4 = ((z4 & 4294967168U) << 13) ^ b;
	return (z1 ^ z2 ^ z3 ^ z4);
}

/* rand() replacement: returns the next value of the LFSR113 sequence. */
static unsigned int rand(void)
{
	return lfsr113_Bits();
}

/*
 * Return `seed` unchanged when it is strictly above `floor`; otherwise lift
 * it just above the floor. Distinct small seeds still map to distinct values.
 */
static unsigned int lfsr113_valid_seed(unsigned int seed, unsigned int floor)
{
	return (seed > floor) ? seed : seed + floor + 1;
}

/*
 * srand() replacement: seed all four state words from `seed`.
 *
 * Seeds above 127 are copied verbatim into z1..z4. Smaller seeds are
 * adjusted per component so the minimum-seed requirements hold; without
 * this, e.g. srand(0) would make rand() return 0 forever.
 */
static void srand(unsigned int seed)
{
	z1 = lfsr113_valid_seed(seed, 1);
	z2 = lfsr113_valid_seed(seed, 7);
	z3 = lfsr113_valid_seed(seed, 15);
	z4 = lfsr113_valid_seed(seed, 127);
}
\r
//-END:-Time-and-randomization-overrides-----------------------------------
\r
69 //-----Assertion-overrides-------------------------------------------------
\r
73 # define assert(EX) (void)((EX) || (__assert (#EX, __FILE__, __LINE__),0))
\r
/*
 * Assertion-failure hook: prints the failed expression text together with
 * the file name and line number through printk.
 *
 * msg:  text of the failed expression
 * file: source file name
 * line: source line number
 */
static inline void __assert(const char *msg, const char *file, int line)
{
	/* `line` is an int, so it has to be formatted with %d rather than %s. */
	printk("Assertion %s in %s:%d failed.\n", msg, file, line);
}
\r
80 //-END:-Assertion-overrides-------------------------------------------------
\r
82 //------Threads-overrides---------------------------------------------------
\r
89 typedef u32 pthread_t;
\r
90 typedef u32 pthread_barrier_t;
\r
91 typedef u32 pthread_attr_t;
\r
93 static inline void CPU_ZERO(cpu_set_t *set) { set->bits = 0; set->count = 0; }
\r
94 static inline void CPU_SET(int cpu, cpu_set_t *set) { set->bits |= (1 << cpu); set->count++; }
\r
95 static inline void CPU_CLR(int cpu, cpu_set_t *set) { set->bits ^= (1 << cpu); set->count--; }
\r
96 static inline int CPU_ISSET(int cpu, cpu_set_t *set) { return (set->bits & (1 << cpu)); }
\r
97 static inline int CPU_COUNT(cpu_set_t *set) { return (int) set->count; }
\r
99 static inline int pthread_setaffinity_np(pthread_t id, unsigned long affty, cpu_set_t * set) {return 0;}
\r
101 static inline void pthread_barrier_init(pthread_barrier_t * bar, pthread_attr_t * attr, unsigned count) {}
\r
102 static inline void pthread_barrier_wait(pthread_barrier_t * bar) {}
\r
103 static inline void pthread_barrier_destroy(pthread_barrier_t * bar) {}
\r
104 static inline void pthread_join(pthread_t id, void * smth){}
\r
106 static int pthread_create(pthread_t *thread, const pthread_attr_t *attr,
\r
107 void *(*start_routine) (void *), void *arg)
\r
109 start_routine(arg);
\r
113 //-END:-Threads-overrides---------------------------------------------------
\r
115 //just remove fflush
\r
116 #define fflush (void)sizeof
\r
121 #else //END JAILHOUSE
\r
123 #define _GNU_SOURCE
\r
124 #include <assert.h>
\r
125 #include <pthread.h>
\r
127 #include <stdbool.h>
\r
128 #include <stdint.h>
\r
130 #include <stdlib.h>
\r
131 #include <string.h>
\r
132 #include <sys/types.h>
\r
134 #include <unistd.h>
\r
138 #define STRINGIFY(val) #val
\r
139 #define TOSTRING(val) STRINGIFY(val)
\r
140 #define LOC __FILE__ ":" TOSTRING(__LINE__) ": "
\r
142 #define CHECK(cmd) ({ int ret = (cmd); if (ret == -1) { perror(LOC #cmd); exit(1); }; ret; })
\r
143 #define CHECKPTR(cmd) ({ void *ptr = (cmd); if (ptr == (void*)-1) { perror(LOC #cmd); exit(1); }; ptr; })
\r
144 #define CHECKNULL(cmd) ({ typeof(cmd) ptr = (cmd); if (ptr == NULL) { perror(LOC #cmd); exit(1); }; ptr; })
\r
145 //#define CHECKFGETS(s, size, stream) ({ void *ptr = fgets(s, size, stream); if (ptr == NULL) { if (feof(stream)) fprintf(stderr, LOC "fgets(" #s "): Unexpected end of stream\n"); else perror(LOC "fgets(" #s ")"); exit(1); }; ptr; })
\r
146 #define CHECKTRUE(bool, msg) ({ if (!(bool)) { printf("Error: " msg "\n"); exit(1); }; })
\r
151 unsigned num_threads;
\r
152 unsigned read_count;
\r
156 bool use_cycles; /* instead of ns */
\r
161 uint32_t dummy[(64 - sizeof(struct s*))/sizeof(uint32_t)];
\r
164 _Static_assert(sizeof(struct s) == 64, "Struct size differs from cacheline size");
\r
169 #define MRS32(reg) ({ uint32_t v; asm volatile ("mrs %0," # reg : "=r" (v)); v; })
\r
170 #define MRS64(reg) ({ uint64_t v; asm volatile ("mrs %0," # reg : "=r" (v)); v; })
\r
172 #define MSR(reg, v) ({ asm volatile ("msr " # reg ",%0" :: "r" (v)); })
\r
174 static void ccntr_init(void)
\r
176 MSR(PMCNTENSET_EL0, 0x80000000);
\r
177 MSR(PMCR_EL0, MRS32(PMCR_EL0) | 1);
\r
180 static uint64_t ccntr_get(void)
\r
182 return MRS64(PMCCNTR_EL0);
\r
/* Counter initialisation for this variant: intentionally empty. */
static void ccntr_init(void)
{
}
\r
187 static uint64_t ccntr_get(void)
\r
190 //taken from lib/timing.c
\r
194 asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
\r
195 return (u64)lo | (((u64)hi) << 32);
\r
199 asm volatile("rdtsc" : "=A" (v));
\r
209 static uint64_t get_time(struct cfg *cfg)
\r
211 if (cfg->use_cycles == false) {
\r
218 clock_gettime(CLOCK_MONOTONIC, &t);
\r
219 return (uint64_t)t.tv_sec * 1000000000 + t.tv_nsec;
\r
223 return ccntr_get();
\r
/*
 * Number of elements in the array `x`. Only meaningful for real arrays, not
 * for pointers. The argument is parenthesized before indexing so that
 * expression arguments are subscripted as a whole.
 */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
\r
229 static void prepare(struct s *array, unsigned size, bool sequential)
\r
232 int count = size / sizeof(struct s);
\r
235 for (i = 0; i < count - 1; i++)
\r
236 array[i].ptr = &array[i+1];
\r
237 array[count - 1].ptr = &array[0];
\r
239 memset(array, 0, size);
\r
240 struct s *p = &array[0];
\r
241 for (i = 0; i < count - 1; i++) {
\r
242 p->ptr = (struct s*)1; /* Mark as occupied to avoid self-loop */
\r
243 for (j = rand() % count;
\r
244 array[j].ptr != NULL;
\r
245 j = (j >= count) ? 0 : j+1);
\r
246 p = p->ptr = &array[j];
\r
248 p->ptr = &array[0];
\r
252 static void do_read(struct s *array, unsigned reads)
\r
254 unsigned i = reads / 32;
\r
255 volatile struct s *p = &array[0];
\r
257 p = p->ptr; /* 0 */
\r
258 p = p->ptr; /* 1 */
\r
259 p = p->ptr; /* 2 */
\r
260 p = p->ptr; /* 3 */
\r
261 p = p->ptr; /* 4 */
\r
262 p = p->ptr; /* 5 */
\r
263 p = p->ptr; /* 6 */
\r
264 p = p->ptr; /* 7 */
\r
265 p = p->ptr; /* 8 */
\r
266 p = p->ptr; /* 9 */
\r
267 p = p->ptr; /* 10 */
\r
268 p = p->ptr; /* 11 */
\r
269 p = p->ptr; /* 12 */
\r
270 p = p->ptr; /* 13 */
\r
271 p = p->ptr; /* 14 */
\r
272 p = p->ptr; /* 15 */
\r
273 p = p->ptr; /* 16 */
\r
274 p = p->ptr; /* 17 */
\r
275 p = p->ptr; /* 18 */
\r
276 p = p->ptr; /* 19 */
\r
277 p = p->ptr; /* 20 */
\r
278 p = p->ptr; /* 21 */
\r
279 p = p->ptr; /* 22 */
\r
280 p = p->ptr; /* 23 */
\r
281 p = p->ptr; /* 24 */
\r
282 p = p->ptr; /* 25 */
\r
283 p = p->ptr; /* 26 */
\r
284 p = p->ptr; /* 27 */
\r
285 p = p->ptr; /* 28 */
\r
286 p = p->ptr; /* 29 */
\r
287 p = p->ptr; /* 30 */
\r
288 p = p->ptr; /* 31 */
\r
292 static void do_write(struct s *array, unsigned accesses, unsigned ofs)
\r
294 unsigned i = accesses / 32;
\r
295 volatile struct s *p = &array[0];
\r
297 p->dummy[ofs]++; p = p->ptr; /* 0 */
\r
298 p->dummy[ofs]++; p = p->ptr; /* 1 */
\r
299 p->dummy[ofs]++; p = p->ptr; /* 2 */
\r
300 p->dummy[ofs]++; p = p->ptr; /* 3 */
\r
301 p->dummy[ofs]++; p = p->ptr; /* 4 */
\r
302 p->dummy[ofs]++; p = p->ptr; /* 5 */
\r
303 p->dummy[ofs]++; p = p->ptr; /* 6 */
\r
304 p->dummy[ofs]++; p = p->ptr; /* 7 */
\r
305 p->dummy[ofs]++; p = p->ptr; /* 8 */
\r
306 p->dummy[ofs]++; p = p->ptr; /* 9 */
\r
307 p->dummy[ofs]++; p = p->ptr; /* 10 */
\r
308 p->dummy[ofs]++; p = p->ptr; /* 11 */
\r
309 p->dummy[ofs]++; p = p->ptr; /* 12 */
\r
310 p->dummy[ofs]++; p = p->ptr; /* 13 */
\r
311 p->dummy[ofs]++; p = p->ptr; /* 14 */
\r
312 p->dummy[ofs]++; p = p->ptr; /* 15 */
\r
313 p->dummy[ofs]++; p = p->ptr; /* 16 */
\r
314 p->dummy[ofs]++; p = p->ptr; /* 17 */
\r
315 p->dummy[ofs]++; p = p->ptr; /* 18 */
\r
316 p->dummy[ofs]++; p = p->ptr; /* 19 */
\r
317 p->dummy[ofs]++; p = p->ptr; /* 20 */
\r
318 p->dummy[ofs]++; p = p->ptr; /* 21 */
\r
319 p->dummy[ofs]++; p = p->ptr; /* 22 */
\r
320 p->dummy[ofs]++; p = p->ptr; /* 23 */
\r
321 p->dummy[ofs]++; p = p->ptr; /* 24 */
\r
322 p->dummy[ofs]++; p = p->ptr; /* 25 */
\r
323 p->dummy[ofs]++; p = p->ptr; /* 26 */
\r
324 p->dummy[ofs]++; p = p->ptr; /* 27 */
\r
325 p->dummy[ofs]++; p = p->ptr; /* 28 */
\r
326 p->dummy[ofs]++; p = p->ptr; /* 29 */
\r
327 p->dummy[ofs]++; p = p->ptr; /* 30 */
\r
328 p->dummy[ofs]++; p = p->ptr; /* 31 */
\r
332 struct benchmark_thread {
\r
339 pthread_barrier_t barrier;
\r
341 struct s array[MAX_CPUS][64*0x100000/sizeof(struct s)] __attribute__ ((aligned (2*1024*1024)));
\r
345 static void *benchmark_thread(void *arg)
\r
347 struct benchmark_thread *me = arg;
\r
351 CPU_SET(me->cpu, &set);
\r
353 if (pthread_setaffinity_np(me->id, sizeof(set), &set) != 0) {
\r
354 perror("pthread_setaffinity_np");
\r
358 prepare(array[me->cpu], me->cfg->size, me->cfg->sequential);
\r
360 pthread_barrier_wait(&barrier);
\r
363 printf("CPU %d starts measurement\n", me->cpu);
\r
366 tic = get_time(me->cfg);
\r
367 if (me->cfg->write == false)
\r
368 do_read(array[me->cpu], me->cfg->read_count);
\r
370 do_write(array[me->cpu], me->cfg->read_count, me->cfg->ofs);
\r
372 tac = get_time(me->cfg);
\r
373 me->result = (double)(tac - tic) / me->cfg->read_count;
\r
378 static void run_benchmark(struct cfg *cfg)
\r
380 struct benchmark_thread thread[MAX_CPUS];
\r
382 cpu_set_t set = cfg->cpu_set;
\r
383 pthread_barrier_init(&barrier, NULL, cfg->num_threads);
\r
384 for (i = 0; i < cfg->num_threads; i++) {
\r
385 thread[i].cfg = cfg;
\r
386 if (CPU_COUNT(&set) == 0) {
\r
390 for (j = 0; j < MAX_CPUS; j++) {
\r
391 if (CPU_ISSET(j, &set)) {
\r
399 printf( "Running thread %d on CPU %d\n", i, thread[i].cpu);
\r
400 pthread_create(&thread[i].id, NULL, benchmark_thread, &thread[i]);
\r
403 for (i = 0; i < cfg->num_threads; i++) {
\r
404 pthread_join(thread[i].id, NULL);
\r
406 pthread_barrier_destroy(&barrier);
\r
408 printf("%d", cfg->size);
\r
409 for (i = 0; i < cfg->num_threads; i++) {
\r
410 printf("\t%#.3g", thread[i].result);
\r
418 void inmate_main(void)
\r
420 int main(int argc, char *argv[])
\r
424 .sequential = true,
\r
427 .read_count = 0x2000000,
\r
430 .use_cycles = false, /* i.e. use nanoseconds */
\r
435 unsigned long tsc_freq;
\r
438 printk_uart_base = UART_BASE;
\r
440 for (n = 0; n < UART_IDLE_LOOPS; n++)
\r
441 if (!(inb(UART_BASE + UART_LSR) & UART_LSR_THRE))
\r
443 } while (n < UART_IDLE_LOOPS);
\r
445 cfg.read_count = cmdline_parse_int("-c", cfg.read_count);
\r
446 cfg.ofs = cmdline_parse_int("-o", cfg.ofs);
\r
447 cfg.sequential = cmdline_parse_bool("-r");
\r
448 cfg.size = cmdline_parse_int("-s", cfg.size);
\r
449 if (cmdline_parse_bool("-t")) {
\r
450 printk("Threads are not supported. '-t' was ignored.\n");
\r
452 if (cmdline_parse_bool("-C")) {
\r
453 printk("CPU selection is not supported. '-C' was ignored.\n");
\r
455 cfg.write = cmdline_parse_bool("-w");
\r
456 cfg.use_cycles = cmdline_parse_bool("-y");
\r
457 //initialize timing
\r
458 tsc_freq = tsc_init();
\r
459 printk("Calibrated TSC frequency: %lu.%03u kHz\n", tsc_freq / 1000,
\r
461 #else //Linux param's parsing
\r
462 CPU_ZERO(&cfg.cpu_set);
\r
465 while ((opt = getopt(argc, argv, "c:C:o:rs:t:wy")) != -1) {
\r
468 cfg.read_count = atol(optarg);
\r
471 cfg.ofs = atol(optarg);
\r
473 case 'r': /* random */
\r
474 cfg.sequential = false;
\r
477 cfg.size = atol(optarg);
\r
478 assert(cfg.size <= sizeof(array[0]));
\r
481 cfg.num_threads = atol(optarg);
\r
484 CPU_SET(atol(optarg), &cfg.cpu_set);
\r
490 cfg.use_cycles = true;
\r
493 fprintf(stderr, "Usage: %s ... TODO\n", argv[0]);
\r
503 assert(cfg.ofs < ARRAY_SIZE(s.dummy));
\r
506 if (cfg.use_cycles)
\r
509 if (cfg.size != 0) {
\r
510 run_benchmark(&cfg);
\r
512 unsigned order, size, step;
\r
513 for (order = 10; order <= 24; order++) {
\r
514 for (step = 0; step < 2; step++) {
\r
520 run_benchmark(&cfg);
\r