1 /*--------------------------------------------------------------------*/
2 /*--- Cache simulation. ---*/
4 /*--------------------------------------------------------------------*/
7 This file is part of Callgrind, a Valgrind tool for call graph
10 Copyright (C) 2003-2010, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
12 This tool is derived from and contains code from Cachegrind
13 Copyright (C) 2002-2010 Nicholas Nethercote (njn@valgrind.org)
15 This program is free software; you can redistribute it and/or
16 modify it under the terms of the GNU General Public License as
17 published by the Free Software Foundation; either version 2 of the
18 License, or (at your option) any later version.
20 This program is distributed in the hope that it will be useful, but
21 WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 General Public License for more details.
25 You should have received a copy of the GNU General Public License
26 along with this program; if not, write to the Free Software
27 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
30 The GNU General Public License is contained in the file COPYING.
37 - simulates a write-allocate cache
38 - (block --> set) hash function uses simple bit selection
39 - handling of references straddling two cache blocks:
40 - counts as only one cache access (not two)
41 - both blocks hit --> one hit
42 - one block hits, the other misses --> one miss
43 - both blocks miss --> one miss (not two)
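  Example: with 64 B lines the set index is (addr >> 6) & (sets-1);
  a 4 byte access at line offset 62 touches two lines but is still
  counted as a single cache access.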
46 /* Cache configuration */
49 /* additional structures for cache use info, separated
50 * according to usage frequency:
51 * - line_loaded : pointer to cost center of instruction
52 * which loaded the line into cache.
53 * Needed to increment counters when line is evicted.
54 * - line_use : updated on every access
58 UInt mask; /* usage bit mask, e.g. for a 64 byte line size: 1 bit per 2 bytes */
63 line_use* dep_use; /* points to the higher-level cache block for this memline */
72 int line_size; /* bytes */
73 Bool sectored; /* prefetch the neighboring cache line on a read */
91 * States of flat caches in our model.
92 * We use a 2-level hierarchy,
94 static cache_t2 I1, D1, L2;
96 /* Lower bits of cache tags are used as flags for a cache line */
97 #define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
98 #define CACHELINE_DIRTY 1
101 /* Cache simulator Options */
102 static Bool clo_simulate_writeback = False;
103 static Bool clo_simulate_hwpref = False;
104 static Bool clo_simulate_sectors = False;
105 static Bool clo_collect_cacheuse = False;
107 /* The following global vars are set up beforehand by setup_bbcc():
109 * - Addr CLG_(bb_base) (instruction start address of original BB)
110 * - ULong* CLG_(cost_base) (start of cost array for BB)
114 ULong* CLG_(cost_base);
116 static InstrInfo* current_ii;
118 /* Cache use offsets */
119 /* The offsets are only correct because all per-instruction event sets get
120 * the "Use" set added first !
122 static Int off_I1_AcCost = 0;
123 static Int off_I1_SpLoss = 1;
124 static Int off_D1_AcCost = 0;
125 static Int off_D1_SpLoss = 1;
126 static Int off_L2_AcCost = 2;
127 static Int off_L2_SpLoss = 3;
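/* These are the offsets of the four EG_USE events registered in
 * CLG_(init_eventsets)(): "AcCost1", "SpLoss1", "AcCost2", "SpLoss2". */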
129 /* Cache access types */
130 typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
132 /* Result of a reference into a flat cache */
133 typedef enum { Hit = 0, Miss, MissDirty } CacheResult;
135 /* Result of a reference into a hierarchical cache model */
140 WriteBackMemAccess } CacheModelResult;
142 typedef CacheModelResult (*simcall_type)(Addr, UChar);
145 simcall_type I1_Read;
146 simcall_type D1_Read;
147 simcall_type D1_Write;
150 /*------------------------------------------------------------*/
151 /*--- Cache Simulator Initialization ---*/
152 /*------------------------------------------------------------*/
154 static void cachesim_clearcache(cache_t2* c)
158 for (i = 0; i < c->sets * c->assoc; i++)
161 for (i = 0; i < c->sets * c->assoc; i++) {
162 c->loaded[i].memline = 0;
163 c->loaded[i].use_base = 0;
164 c->loaded[i].dep_use = 0;
165 c->loaded[i].iaddr = 0;
168 c->tags[i] = i % c->assoc; /* init lower bits as pointer */
173 static void cacheuse_initcache(cache_t2* c);
175 /* By this point, the size/assoc/line_size has been checked. */
176 static void cachesim_initcache(cache_t config, cache_t2* c)
178 c->size = config.size;
179 c->assoc = config.assoc;
180 c->line_size = config.line_size;
181 c->sectored = False; // FIXME
183 c->sets = (c->size / c->line_size) / c->assoc;
184 c->sets_min_1 = c->sets - 1;
185 c->line_size_bits = VG_(log2)(c->line_size);
186 c->tag_shift = c->line_size_bits + VG_(log2)(c->sets);
187 c->tag_mask = ~((1<<c->tag_shift)-1);
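    /* Worked example (illustrative): a 64 KB, 2-way cache with 64 B lines
     * has 512 sets, line_size_bits = 6, tag_shift = 6 + 9 = 15 and
     * tag_mask = ~0x7fff. */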
189 /* Can bits in tag entries be used for flags?
190 * Should always be true, as MIN_LINE_SIZE >= 16 implies tag_shift >= 4. */
191 CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);
194 VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
195 c->size, c->line_size,
196 c->sectored ? ", sectored":"");
198 VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
199 c->size, c->line_size, c->assoc,
200 c->sectored ? ", sectored":"");
203 c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
204 sizeof(UWord) * c->sets * c->assoc);
205 if (clo_collect_cacheuse)
206 cacheuse_initcache(c);
209 cachesim_clearcache(c);
214 static void print_cache(cache_t2* c)
218 /* Note initialisation and update of 'i'. */
219 for (i = 0, set = 0; set < c->sets; set++) {
220 for (way = 0; way < c->assoc; way++, i++) {
221 VG_(printf)("%8x ", c->tags[i]);
229 /*------------------------------------------------------------*/
230 /*--- Write Through Cache Simulation ---*/
231 /*------------------------------------------------------------*/
234 * Simple model: L1 & L2 Write Through
235 * Does not distinguish between read and write references
237 * Simulator functions:
238 * CacheModelResult cachesim_I1_ref(Addr a, UChar size)
239 * CacheModelResult cachesim_D1_ref(Addr a, UChar size)
243 CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
248 set = &(c->tags[set_no * c->assoc]);
250 /* This loop is unrolled for just the first case, which is the most */
251 /* common. We can't unroll any further because it would screw up */
252 /* if we have a direct-mapped (1-way) cache. */
256 /* If the tag is one other than the MRU, move it into the MRU spot */
257 /* and shuffle the rest down. */
258 for (i = 1; i < c->assoc; i++) {
260 for (j = i; j > 0; j--) {
268 /* A miss; install this tag as MRU, shuffle rest down. */
269 for (j = c->assoc - 1; j > 0; j--) {
277 static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
279 UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
280 UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
281 UWord tag = a >> c->tag_shift;
283 /* Access entirely within line. */
285 return cachesim_setref(c, set1, tag);
287 /* Access straddles two lines. */
288 /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
289 else if (((set1 + 1) & (c->sets-1)) == set2) {
290 UWord tag2 = (a+size-1) >> c->tag_shift;
292 /* the call updates cache structures as a side effect */
293 CacheResult res1 = cachesim_setref(c, set1, tag);
294 CacheResult res2 = cachesim_setref(c, set2, tag2);
295 return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
298 VG_(printf)("addr: %lx size: %u sets: %d %d", a, size, set1, set2);
299 VG_(tool_panic)("item straddles more than two cache sets");
305 CacheModelResult cachesim_I1_ref(Addr a, UChar size)
307 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
308 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
313 CacheModelResult cachesim_D1_ref(Addr a, UChar size)
315 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
316 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
321 /*------------------------------------------------------------*/
322 /*--- Write Back Cache Simulation ---*/
323 /*------------------------------------------------------------*/
326 * More complex model: L1 Write-through, L2 Write-back
327 * This needs to distinguish between read and write references.
329 * Simulator functions:
330 * CacheModelResult cachesim_I1_Read(Addr a, UChar size)
331 * CacheModelResult cachesim_D1_Read(Addr a, UChar size)
332 * CacheModelResult cachesim_D1_Write(Addr a, UChar size)
336 * With write-back, the result can be a miss that evicts a dirty line.
337 * The dirty state of a cache line is stored in bit 0 of the tag for
338 * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
339 * type (Read/Write) into the tag, the line becomes dirty on a write.
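 * E.g. as Read = 0 and Write = CACHELINE_DIRTY, "set[i] | ref" leaves the
 * stored tag unchanged on a read and sets the dirty bit on a write.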
342 CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
347 set = &(c->tags[set_no * c->assoc]);
349 /* This loop is unrolled for just the first case, which is the most */
350 /* common. We can't unroll any further because it would screw up */
351 /* if we have a direct-mapped (1-way) cache. */
352 if (tag == (set[0] & ~CACHELINE_DIRTY)) {
356 /* If the tag is one other than the MRU, move it into the MRU spot */
357 /* and shuffle the rest down. */
358 for (i = 1; i < c->assoc; i++) {
359 if (tag == (set[i] & ~CACHELINE_DIRTY)) {
360 tmp_tag = set[i] | ref; // update dirty flag
361 for (j = i; j > 0; j--) {
369 /* A miss; install this tag as MRU, shuffle rest down. */
370 tmp_tag = set[c->assoc - 1];
371 for (j = c->assoc - 1; j > 0; j--) {
376 return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
381 CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
383 UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
384 UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
385 UWord tag = a & c->tag_mask;
387 /* Access entirely within line. */
389 return cachesim_setref_wb(c, ref, set1, tag);
391 /* Access straddles two lines. */
392 /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
393 else if (((set1 + 1) & (c->sets-1)) == set2) {
394 UWord tag2 = (a+size-1) & c->tag_mask;
396 /* the call updates cache structures as a side effect */
397 CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
398 CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2);
400 if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
401 return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
404 VG_(printf)("addr: %lx size: %u sets: %d %d", a, size, set1, set2);
405 VG_(tool_panic)("item straddles more than two cache sets");
412 CacheModelResult cachesim_I1_Read(Addr a, UChar size)
414 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
415 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
416 case Hit: return L2_Hit;
417 case Miss: return MemAccess;
420 return WriteBackMemAccess;
424 CacheModelResult cachesim_D1_Read(Addr a, UChar size)
426 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
427 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
428 case Hit: return L2_Hit;
429 case Miss: return MemAccess;
432 return WriteBackMemAccess;
436 CacheModelResult cachesim_D1_Write(Addr a, UChar size)
438 if ( cachesim_ref( &D1, a, size) == Hit ) {
439 /* Even for an L1 hit, the write-through L1 passes
440 * the write to the L2 to make the L2 line dirty.
441 * But this causes no latency, so return the hit.
443 cachesim_ref_wb( &L2, Write, a, size);
446 switch( cachesim_ref_wb( &L2, Write, a, size) ) {
447 case Hit: return L2_Hit;
448 case Miss: return MemAccess;
451 return WriteBackMemAccess;
455 /*------------------------------------------------------------*/
456 /*--- Hardware Prefetch Simulation ---*/
457 /*------------------------------------------------------------*/
459 static ULong prefetch_up = 0;
460 static ULong prefetch_down = 0;
463 #define PF_PAGEBITS 12
465 static UInt pf_lastblock[PF_STREAMS];
466 static Int pf_seqblocks[PF_STREAMS];
469 void prefetch_clear(void)
472 for(i=0;i<PF_STREAMS;i++)
473 pf_lastblock[i] = pf_seqblocks[i] = 0;
477 * HW Prefetch emulation
478 * Start prefetching when detecting sequential access to 3 memory blocks.
479 * One stream can be detected per 4k page.
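 * Once a stream is established, the line 5 blocks ahead of (or behind)
 * the current access is additionally referenced in the L2.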
482 void prefetch_L2_doref(Addr a)
484 UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
485 UInt block = ( a >> L2.line_size_bits);
487 if (block != pf_lastblock[stream]) {
488 if (pf_seqblocks[stream] == 0) {
489 if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
490 else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
492 else if (pf_seqblocks[stream] >0) {
493 if (pf_lastblock[stream] +1 == block) {
494 pf_seqblocks[stream]++;
495 if (pf_seqblocks[stream] >= 2) {
497 cachesim_ref(&L2, a + 5 * L2.line_size,1);
500 else pf_seqblocks[stream] = 0;
502 else if (pf_seqblocks[stream] <0) {
503 if (pf_lastblock[stream] -1 == block) {
504 pf_seqblocks[stream]--;
505 if (pf_seqblocks[stream] <= -2) {
507 cachesim_ref(&L2, a - 5 * L2.line_size,1);
510 else pf_seqblocks[stream] = 0;
512 pf_lastblock[stream] = block;
516 /* simple model with hardware prefetch */
519 CacheModelResult prefetch_I1_ref(Addr a, UChar size)
521 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
522 prefetch_L2_doref(a);
523 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
528 CacheModelResult prefetch_D1_ref(Addr a, UChar size)
530 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
531 prefetch_L2_doref(a);
532 if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
537 /* complex model with hardware prefetch */
540 CacheModelResult prefetch_I1_Read(Addr a, UChar size)
542 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
543 prefetch_L2_doref(a);
544 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
545 case Hit: return L2_Hit;
546 case Miss: return MemAccess;
549 return WriteBackMemAccess;
553 CacheModelResult prefetch_D1_Read(Addr a, UChar size)
555 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
556 prefetch_L2_doref(a);
557 switch( cachesim_ref_wb( &L2, Read, a, size) ) {
558 case Hit: return L2_Hit;
559 case Miss: return MemAccess;
562 return WriteBackMemAccess;
566 CacheModelResult prefetch_D1_Write(Addr a, UChar size)
568 prefetch_L2_doref(a);
569 if ( cachesim_ref( &D1, a, size) == Hit ) {
570 /* Even for an L1 hit, the write-through L1 passes
571 * the write to the L2 to make the L2 line dirty.
572 * But this causes no latency, so return the hit.
574 cachesim_ref_wb( &L2, Write, a, size);
577 switch( cachesim_ref_wb( &L2, Write, a, size) ) {
578 case Hit: return L2_Hit;
579 case Miss: return MemAccess;
582 return WriteBackMemAccess;
586 /*------------------------------------------------------------*/
587 /*--- Cache Simulation with use metric collection ---*/
588 /*------------------------------------------------------------*/
590 /* cannot be combined with write-back or prefetch */
593 void cacheuse_initcache(cache_t2* c)
596 unsigned int start_mask, start_val;
597 unsigned int end_mask, end_val;
599 c->use = CLG_MALLOC("cl.sim.cu_ic.1",
600 sizeof(line_use) * c->sets * c->assoc);
601 c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
602 sizeof(line_loaded) * c->sets * c->assoc);
603 c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
604 sizeof(int) * c->line_size);
605 c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
606 sizeof(int) * c->line_size);
608 c->line_size_mask = c->line_size-1;
610 /* Meaning of line_start_mask/line_end_mask
611 * Example: for a given cache line, you get an access starting at
612 * byte offset 5, length 4: bytes 5 - 8 were touched. For a cache
613 * line size of 32, you have 1 bit per byte in the mask:
615 * bit31 bit8 bit5 bit 0
617 * 11..111111100000 line_start_mask[5]
618 * 00..000111111111 line_end_mask[(5+4)-1]
620 * use_mask |= line_start_mask[5] & line_end_mask[8]
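 * For line sizes larger than 32 bytes one mask bit covers
 * line_size/32 bytes (e.g. 2 bytes per bit for 64 B lines).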
623 start_val = end_val = ~0;
624 if (c->line_size < 32) {
625 int bits_per_byte = 32/c->line_size;
626 start_mask = (1<<bits_per_byte)-1;
627 end_mask = start_mask << (32-bits_per_byte);
628 for(i=0;i<c->line_size;i++) {
629 c->line_start_mask[i] = start_val;
630 start_val = start_val & ~start_mask;
631 start_mask = start_mask << bits_per_byte;
633 c->line_end_mask[c->line_size-i-1] = end_val;
634 end_val = end_val & ~end_mask;
635 end_mask = end_mask >> bits_per_byte;
639 int bytes_per_bit = c->line_size/32;
642 for(i=0;i<c->line_size;i++) {
643 c->line_start_mask[i] = start_val;
644 c->line_end_mask[c->line_size-i-1] = end_val;
645 if ( ((i+1)%bytes_per_bit) == 0) {
646 start_val &= ~start_mask;
647 end_val &= ~end_mask;
654 CLG_DEBUG(6, "Config %s:\n", c->desc_line);
655 for(i=0;i<c->line_size;i++) {
656 CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
657 i, c->line_start_mask[i], c->line_end_mask[i]);
660 /* We use lower tag bits as offset pointers to cache use info.
661 * I.e. some cache configurations are not supported: the associativity
 * must fit into the unused low tag bits.
663 if ( (1<<c->tag_shift) < c->assoc) {
664 VG_(message)(Vg_DebugMsg,
665 "error: Use associativity < %d for cache use statistics!\n",
667 VG_(tool_panic)("Unsupported cache configuration");
672 /* for I1/D1 caches */
673 #define CACHEUSE(L) \
675 static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
677 UInt set1 = ( a >> L.line_size_bits) & (L.sets_min_1); \
678 UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
679 UWord tag = a & L.tag_mask; \
682 UWord *set, tmp_tag; \
685 CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n", \
686 L.name, a, size, set1, set2); \
688 /* First case: word entirely within line. */ \
689 if (set1 == set2) { \
691 set = &(L.tags[set1 * L.assoc]); \
692 use_mask = L.line_start_mask[a & L.line_size_mask] & \
693 L.line_end_mask[(a+size-1) & L.line_size_mask]; \
695 /* This loop is unrolled for just the first case, which is the most */\
696 /* common. We can't unroll any further because it would screw up */\
697 /* if we have a direct-mapped (1-way) cache. */\
698 if (tag == (set[0] & L.tag_mask)) { \
699 idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
700 L.use[idx].count ++; \
701 L.use[idx].mask |= use_mask; \
702 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
703 idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
704 use_mask, L.use[idx].mask, L.use[idx].count); \
707 /* If the tag is one other than the MRU, move it into the MRU spot */\
708 /* and shuffle the rest down. */\
709 for (i = 1; i < L.assoc; i++) { \
710 if (tag == (set[i] & L.tag_mask)) { \
712 for (j = i; j > 0; j--) { \
713 set[j] = set[j - 1]; \
716 idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
717 L.use[idx].count ++; \
718 L.use[idx].mask |= use_mask; \
719 CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
720 i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
721 use_mask, L.use[idx].mask, L.use[idx].count); \
726 /* A miss; install this tag as MRU, shuffle rest down. */ \
727 tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
728 for (j = L.assoc - 1; j > 0; j--) { \
729 set[j] = set[j - 1]; \
731 set[0] = tag | tmp_tag; \
732 idx = (set1 * L.assoc) + tmp_tag; \
733 return update_##L##_use(&L, idx, \
734 use_mask, a &~ L.line_size_mask); \
736 /* Second case: word straddles two lines. */ \
737 /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
738 } else if (((set1 + 1) & (L.sets-1)) == set2) { \
739 Int miss1=0, miss2=0; /* CacheModelResult: 0 = L1 hit, 1 = L1 miss/L2 hit, 2 = L2 miss */ \
740 set = &(L.tags[set1 * L.assoc]); \
741 use_mask = L.line_start_mask[a & L.line_size_mask]; \
742 if (tag == (set[0] & L.tag_mask)) { \
743 idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
744 L.use[idx].count ++; \
745 L.use[idx].mask |= use_mask; \
746 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
747 idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
748 use_mask, L.use[idx].mask, L.use[idx].count); \
751 for (i = 1; i < L.assoc; i++) { \
752 if (tag == (set[i] & L.tag_mask)) { \
754 for (j = i; j > 0; j--) { \
755 set[j] = set[j - 1]; \
758 idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
759 L.use[idx].count ++; \
760 L.use[idx].mask |= use_mask; \
761 CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
762 i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
763 use_mask, L.use[idx].mask, L.use[idx].count); \
767 tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
768 for (j = L.assoc - 1; j > 0; j--) { \
769 set[j] = set[j - 1]; \
771 set[0] = tag | tmp_tag; \
772 idx = (set1 * L.assoc) + tmp_tag; \
773 miss1 = update_##L##_use(&L, idx, \
774 use_mask, a &~ L.line_size_mask); \
776 set = &(L.tags[set2 * L.assoc]); \
777 use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \
778 tag2 = (a+size-1) & L.tag_mask; \
779 if (tag2 == (set[0] & L.tag_mask)) { \
780 idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask); \
781 L.use[idx].count ++; \
782 L.use[idx].mask |= use_mask; \
783 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
784 idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
785 use_mask, L.use[idx].mask, L.use[idx].count); \
788 for (i = 1; i < L.assoc; i++) { \
789 if (tag2 == (set[i] & L.tag_mask)) { \
791 for (j = i; j > 0; j--) { \
792 set[j] = set[j - 1]; \
795 idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask); \
796 L.use[idx].count ++; \
797 L.use[idx].mask |= use_mask; \
798 CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
799 i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
800 use_mask, L.use[idx].mask, L.use[idx].count); \
804 tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
805 for (j = L.assoc - 1; j > 0; j--) { \
806 set[j] = set[j - 1]; \
808 set[0] = tag2 | tmp_tag; \
809 idx = (set2 * L.assoc) + tmp_tag; \
810 miss2 = update_##L##_use(&L, idx, \
811 use_mask, (a+size-1) &~ L.line_size_mask); \
812 return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit; \
815 VG_(printf)("addr: %#lx size: %u sets: %d %d", a, size, set1, set2); \
816 VG_(tool_panic)("item straddles more than two cache sets"); \
822 /* logarithmic bitcounting algorithm, see
823 * http://graphics.stanford.edu/~seander/bithacks.html
825 static __inline__ unsigned int countBits(unsigned int bits)
827 unsigned int c; // store the total here
828 const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
829 const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};
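  // Each step sums adjacent bit fields in parallel; after the last
  // step c holds the number of set bits of the input value.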
832 c = ((c >> S[0]) & B[0]) + (c & B[0]);
833 c = ((c >> S[1]) & B[1]) + (c & B[1]);
834 c = ((c >> S[2]) & B[2]) + (c & B[2]);
835 c = ((c >> S[3]) & B[3]) + (c & B[3]);
836 c = ((c >> S[4]) & B[4]) + (c & B[4]);
840 static void update_L2_use(int idx, Addr memline)
842 line_loaded* loaded = &(L2.loaded[idx]);
843 line_use* use = &(L2.use[idx]);
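  /* i = number of bytes of the evicted line that were never accessed
   * (bits not set in use->mask, scaled from mask bits to bytes) */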
844 int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;
846 CLG_DEBUG(2, " L2.miss [%d]: at %#lx accessing memline %#lx\n",
847 idx, CLG_(bb_base) + current_ii->instr_offset, memline);
849 CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
850 use->count, i, use->mask, loaded->memline, loaded->iaddr);
851 CLG_DEBUG(2, " collect: %d, use_base %p\n",
852 CLG_(current_state).collect, loaded->use_base);
854 if (CLG_(current_state).collect && loaded->use_base) {
855 (loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
856 (loaded->use_base)[off_L2_SpLoss] += i;
863 loaded->memline = memline;
864 loaded->iaddr = CLG_(bb_base) + current_ii->instr_offset;
865 loaded->use_base = (CLG_(current_state).nonskipped) ?
866 CLG_(current_state).nonskipped->skipped :
867 CLG_(cost_base) + current_ii->cost_offset;
871 CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
873 UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
874 UWord* set = &(L2.tags[setNo * L2.assoc]);
875 UWord tag = memline & L2.tag_mask;
880 CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);
882 if (tag == (set[0] & L2.tag_mask)) {
883 idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask);
884 l1_loaded->dep_use = &(L2.use[idx]);
886 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
887 idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
888 L2.use[idx].mask, L2.use[idx].count);
891 for (i = 1; i < L2.assoc; i++) {
892 if (tag == (set[i] & L2.tag_mask)) {
894 for (j = i; j > 0; j--) {
898 idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask);
899 l1_loaded->dep_use = &(L2.use[idx]);
901 CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
902 i, idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
903 L2.use[idx].mask, L2.use[idx].count);
908 /* A miss; install this tag as MRU, shuffle rest down. */
909 tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
910 for (j = L2.assoc - 1; j > 0; j--) {
913 set[0] = tag | tmp_tag;
914 idx = (setNo * L2.assoc) + tmp_tag;
915 l1_loaded->dep_use = &(L2.use[idx]);
917 update_L2_use(idx, memline);
925 #define UPDATE_USE(L) \
927 static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
928 UInt mask, Addr memline) \
930 line_loaded* loaded = &(cache->loaded[idx]); \
931 line_use* use = &(cache->use[idx]); \
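   /* c = number of bytes of the evicted line that were never accessed */ \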
932 int c = ((32 - countBits(use->mask)) * cache->line_size)>>5; \
934 CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
935 cache->name, idx, CLG_(bb_base) + current_ii->instr_offset, memline, mask); \
936 if (use->count>0) { \
937 CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",\
938 use->count, c, use->mask, loaded->memline, loaded->iaddr); \
939 CLG_DEBUG(2, " collect: %d, use_base %p\n", \
940 CLG_(current_state).collect, loaded->use_base); \
942 if (CLG_(current_state).collect && loaded->use_base) { \
943 (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \
944 (loaded->use_base)[off_##L##_SpLoss] += c; \
946 /* FIXME (?): L1/L2 line sizes must be equal ! */ \
947 loaded->dep_use->mask |= use->mask; \
948 loaded->dep_use->count += use->count; \
954 loaded->memline = memline; \
955 loaded->iaddr = CLG_(bb_base) + current_ii->instr_offset; \
956 loaded->use_base = (CLG_(current_state).nonskipped) ? \
957 CLG_(current_state).nonskipped->skipped : \
958 CLG_(cost_base) + current_ii->cost_offset; \
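   /* memline == 0: final flush from cacheuse_finish(), so do not go to the L2 */ \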
960 if (memline == 0) return L2_Hit; \
961 return cacheuse_L2_access(memline, loaded); \
972 void cacheuse_finish(void)
975 InstrInfo ii = { 0,0,0,0 };
977 if (!CLG_(current_state).collect) return;
983 /* update usage counters */
985 for (i = 0; i < I1.sets * I1.assoc; i++)
986 if (I1.loaded[i].use_base)
987 update_I1_use( &I1, i, 0,0);
990 for (i = 0; i < D1.sets * D1.assoc; i++)
991 if (D1.loaded[i].use_base)
992 update_D1_use( &D1, i, 0,0);
995 for (i = 0; i < L2.sets * L2.assoc; i++)
996 if (L2.loaded[i].use_base)
1002 /*------------------------------------------------------------*/
1003 /*--- Helper functions called by instrumented code ---*/
1004 /*------------------------------------------------------------*/
1008 void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
1011 case WriteBackMemAccess:
1012 if (clo_simulate_writeback) {
1035 Char* cacheRes(CacheModelResult r)
1038 case L1_Hit: return "L1 Hit ";
1039 case L2_Hit: return "L2 Hit ";
1040 case MemAccess: return "L2 Miss";
1041 case WriteBackMemAccess: return "L2 Miss (dirty)";
1049 static void log_1I0D(InstrInfo* ii)
1051 CacheModelResult IrRes;
1054 IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
1056 CLG_DEBUG(6, "log_1I0D: Ir %#lx/%u => %s\n",
1057 CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes));
1059 if (CLG_(current_state).collect) {
1062 if (CLG_(current_state).nonskipped)
1063 cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1065 cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
1067 inc_costs(IrRes, cost_Ir,
1068 CLG_(current_state).cost + fullOffset(EG_IR) );
1073 static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
1075 CacheModelResult Ir1Res, Ir2Res;
1076 ULong *global_cost_Ir;
1079 Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
1081 Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
1083 CLG_DEBUG(6, "log_2I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
1084 CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
1085 CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );
1087 if (!CLG_(current_state).collect) return;
1089 global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
1090 if (CLG_(current_state).nonskipped) {
1091 ULong* skipped_cost_Ir =
1092 CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1094 inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
1095 inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
1099 inc_costs(Ir1Res, global_cost_Ir,
1100 CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
1101 inc_costs(Ir2Res, global_cost_Ir,
1102 CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
1106 static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
1108 CacheModelResult Ir1Res, Ir2Res, Ir3Res;
1109 ULong *global_cost_Ir;
1112 Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
1114 Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
1116 Ir3Res = (*simulator.I1_Read)(CLG_(bb_base) + ii3->instr_offset, ii3->instr_size);
1118 CLG_DEBUG(6, "log_3I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
1119 CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
1120 CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
1121 CLG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );
1123 if (!CLG_(current_state).collect) return;
1125 global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
1126 if (CLG_(current_state).nonskipped) {
1127 ULong* skipped_cost_Ir =
1128 CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1129 inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
1130 inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
1131 inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
1135 inc_costs(Ir1Res, global_cost_Ir,
1136 CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
1137 inc_costs(Ir2Res, global_cost_Ir,
1138 CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
1139 inc_costs(Ir3Res, global_cost_Ir,
1140 CLG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]);
1143 /* Instruction doing a read access */
1146 static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
1148 CacheModelResult IrRes, DrRes;
1151 IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
1152 DrRes = (*simulator.D1_Read)(data_addr, data_size);
1154 CLG_DEBUG(6, "log_1I1Dr: Ir %#lx/%u => %s, Dr %#lx/%lu => %s\n",
1155 CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
1156 data_addr, data_size, cacheRes(DrRes));
1158 if (CLG_(current_state).collect) {
1159 ULong *cost_Ir, *cost_Dr;
1161 if (CLG_(current_state).nonskipped) {
1162 cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1163 cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
1166 cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
1167 cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
1170 inc_costs(IrRes, cost_Ir,
1171 CLG_(current_state).cost + fullOffset(EG_IR) );
1172 inc_costs(DrRes, cost_Dr,
1173 CLG_(current_state).cost + fullOffset(EG_DR) );
1179 static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
1181 CacheModelResult DrRes;
1184 DrRes = (*simulator.D1_Read)(data_addr, data_size);
1186 CLG_DEBUG(6, "log_0I1Dr: Dr %#lx/%lu => %s\n",
1187 data_addr, data_size, cacheRes(DrRes));
1189 if (CLG_(current_state).collect) {
1192 if (CLG_(current_state).nonskipped)
1193 cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
1195 cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
1197 inc_costs(DrRes, cost_Dr,
1198 CLG_(current_state).cost + fullOffset(EG_DR) );
1203 /* Instruction doing a write access */
1206 static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
1208 CacheModelResult IrRes, DwRes;
1211 IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
1212 DwRes = (*simulator.D1_Write)(data_addr, data_size);
1214 CLG_DEBUG(6, "log_1I1Dw: Ir %#lx/%u => %s, Dw %#lx/%lu => %s\n",
1215 CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
1216 data_addr, data_size, cacheRes(DwRes));
1218 if (CLG_(current_state).collect) {
1219 ULong *cost_Ir, *cost_Dw;
1221 if (CLG_(current_state).nonskipped) {
1222 cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1223 cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
1226 cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
1227 cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
1230 inc_costs(IrRes, cost_Ir,
1231 CLG_(current_state).cost + fullOffset(EG_IR) );
1232 inc_costs(DwRes, cost_Dw,
1233 CLG_(current_state).cost + fullOffset(EG_DW) );
1238 static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
1240 CacheModelResult DwRes;
1243 DwRes = (*simulator.D1_Write)(data_addr, data_size);
1245 CLG_DEBUG(6, "log_0I1Dw: Dw %#lx/%lu => %s\n",
1246 data_addr, data_size, cacheRes(DwRes));
1248 if (CLG_(current_state).collect) {
1251 if (CLG_(current_state).nonskipped)
1252 cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
1254 cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
1256 inc_costs(DwRes, cost_Dw,
1257 CLG_(current_state).cost + fullOffset(EG_DW) );
1263 /*------------------------------------------------------------*/
1264 /*--- Cache configuration ---*/
1265 /*------------------------------------------------------------*/
1267 #define UNDEFINED_CACHE ((cache_t) { -1, -1, -1 })
1269 static cache_t clo_I1_cache = UNDEFINED_CACHE;
1270 static cache_t clo_D1_cache = UNDEFINED_CACHE;
1271 static cache_t clo_L2_cache = UNDEFINED_CACHE;
1274 // Checks cache config is ok. Returns NULL if ok, or a pointer to an error
1275 // string otherwise.
1276 static Char* check_cache(cache_t* cache)
1278 // Simulator requires line size and set count to be powers of two.
1279 if (( cache->size % (cache->line_size * cache->assoc) != 0) ||
1280 (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc)))
1282 return "Cache set count is not a power of two.\n";
1285 // Simulator requires line size to be a power of two.
1286 if (-1 == VG_(log2)(cache->line_size)) {
1287 return "Cache line size is not a power of two.\n";
1290 // Then check line size >= 16 -- any smaller and a single instruction could
1291 // straddle three cache lines, which would break a simulation assertion.
1293 if (cache->line_size < MIN_LINE_SIZE) {
1294 return "Cache line size is too small.\n";
1297 /* Then check cache size > line size (causes seg faults if not). */
1298 if (cache->size <= cache->line_size) {
1299 return "Cache size <= line size.\n";
1302 /* Then check assoc <= (size / line size) (seg faults otherwise). */
1303 if (cache->assoc > (cache->size / cache->line_size)) {
1304 return "Cache associativity > (size / line size).\n";
1311 void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
1313 #define DEFINED(L) (-1 != L.size || -1 != L.assoc || -1 != L.line_size)
1317 Bool all_caches_clo_defined =
1318 (DEFINED(clo_I1_cache) &&
1319 DEFINED(clo_D1_cache) &&
1320 DEFINED(clo_L2_cache));
1322 // Set the cache config (using auto-detection, if supported by the
1324 VG_(configure_caches)( I1c, D1c, L2c, all_caches_clo_defined );
1326 // Check the default/auto-detected values.
1327 checkRes = check_cache(I1c); tl_assert(!checkRes);
1328 checkRes = check_cache(D1c); tl_assert(!checkRes);
1329 checkRes = check_cache(L2c); tl_assert(!checkRes);
1331 // Then replace with any defined on the command line.
1332 if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
1333 if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
1334 if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
1336 if (VG_(clo_verbosity) > 1) {
1337 VG_(message)(Vg_UserMsg, "Cache configuration used:\n");
1338 VG_(message)(Vg_UserMsg, " I1: %dB, %d-way, %dB lines\n",
1339 I1c->size, I1c->assoc, I1c->line_size);
1340 VG_(message)(Vg_UserMsg, " D1: %dB, %d-way, %dB lines\n",
1341 D1c->size, D1c->assoc, D1c->line_size);
1342 VG_(message)(Vg_UserMsg, " L2: %dB, %d-way, %dB lines\n",
1343 L2c->size, L2c->assoc, L2c->line_size);
1345 #undef DEFINED
1349 /* Initialize and clear simulator state */
1350 static void cachesim_post_clo_init(void)
1352 /* Cache configurations. */
1353 cache_t I1c, D1c, L2c;
1355 /* Initialize access handlers */
1356 if (!CLG_(clo).simulate_cache) {
1357 CLG_(cachesim).log_1I0D = 0;
1358 CLG_(cachesim).log_1I0D_name = "(no function)";
1359 CLG_(cachesim).log_2I0D = 0;
1360 CLG_(cachesim).log_2I0D_name = "(no function)";
1361 CLG_(cachesim).log_3I0D = 0;
1362 CLG_(cachesim).log_3I0D_name = "(no function)";
1364 CLG_(cachesim).log_1I1Dr = 0;
1365 CLG_(cachesim).log_1I1Dr_name = "(no function)";
1366 CLG_(cachesim).log_1I1Dw = 0;
1367 CLG_(cachesim).log_1I1Dw_name = "(no function)";
1369 CLG_(cachesim).log_0I1Dr = 0;
1370 CLG_(cachesim).log_0I1Dr_name = "(no function)";
1371 CLG_(cachesim).log_0I1Dw = 0;
1372 CLG_(cachesim).log_0I1Dw_name = "(no function)";
1376 /* Configuration of caches only needed with real cache simulation */
1377 configure_caches(&I1c, &D1c, &L2c);
1383 cachesim_initcache(I1c, &I1);
1384 cachesim_initcache(D1c, &D1);
1385 cachesim_initcache(L2c, &L2);
1387 /* the other cache simulators use the standard helpers
1388 * with dispatching via simulator struct */
1390 CLG_(cachesim).log_1I0D = log_1I0D;
1391 CLG_(cachesim).log_1I0D_name = "log_1I0D";
1392 CLG_(cachesim).log_2I0D = log_2I0D;
1393 CLG_(cachesim).log_2I0D_name = "log_2I0D";
1394 CLG_(cachesim).log_3I0D = log_3I0D;
1395 CLG_(cachesim).log_3I0D_name = "log_3I0D";
1397 CLG_(cachesim).log_1I1Dr = log_1I1Dr;
1398 CLG_(cachesim).log_1I1Dw = log_1I1Dw;
1399 CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
1400 CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
1402 CLG_(cachesim).log_0I1Dr = log_0I1Dr;
1403 CLG_(cachesim).log_0I1Dw = log_0I1Dw;
1404 CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
1405 CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
1407 if (clo_collect_cacheuse) {
1409 /* Output a warning for unsupported option combinations */
1410 if (clo_simulate_hwpref) {
1411 VG_(message)(Vg_DebugMsg,
1412 "warning: prefetch simulation can not be "
1413 "used with cache usage\n");
1414 clo_simulate_hwpref = False;
1417 if (clo_simulate_writeback) {
1418 VG_(message)(Vg_DebugMsg,
1419 "warning: write-back simulation can not be "
1420 "used with cache usage\n");
1421 clo_simulate_writeback = False;
1424 simulator.I1_Read = cacheuse_I1_doRead;
1425 simulator.D1_Read = cacheuse_D1_doRead;
1426 simulator.D1_Write = cacheuse_D1_doRead;
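 /* Cache-use collection does not distinguish reads from writes,
  * so writes are handled by the read routine as well. */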
1430 if (clo_simulate_hwpref) {
1433 if (clo_simulate_writeback) {
1434 simulator.I1_Read = prefetch_I1_Read;
1435 simulator.D1_Read = prefetch_D1_Read;
1436 simulator.D1_Write = prefetch_D1_Write;
1439 simulator.I1_Read = prefetch_I1_ref;
1440 simulator.D1_Read = prefetch_D1_ref;
1441 simulator.D1_Write = prefetch_D1_ref;
1447 if (clo_simulate_writeback) {
1448 simulator.I1_Read = cachesim_I1_Read;
1449 simulator.D1_Read = cachesim_D1_Read;
1450 simulator.D1_Write = cachesim_D1_Write;
1453 simulator.I1_Read = cachesim_I1_ref;
1454 simulator.D1_Read = cachesim_D1_ref;
1455 simulator.D1_Write = cachesim_D1_ref;
1460 /* Clear simulator state. The simulator has to be initialized beforehand. */
1462 void cachesim_clear(void)
1464 cachesim_clearcache(&I1);
1465 cachesim_clearcache(&D1);
1466 cachesim_clearcache(&L2);
1472 static void cachesim_getdesc(Char* buf)
1475 p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
1476 p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
1477 VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
1481 void cachesim_print_opts(void)
1484 "\n cache simulator options (does cache simulation if used):\n"
1485 " --simulate-wb=no|yes Count write-back events [no]\n"
1486 " --simulate-hwpref=no|yes Simulate hardware prefetch [no]\n"
1487 #if CLG_EXPERIMENTAL
1488 " --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
1490 " --cacheuse=no|yes Collect cache block use [no]\n"
1491 " --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
1492 " --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
1493 " --L2=<size>,<assoc>,<line_size> set L2 cache manually\n"
1497 static void parse_opt ( cache_t* cache, char* opt, Char* optval )
1503 // Option argument looks like "65536,2,64". Extract them.
1504 i1 = VG_(strtoll10)(optval, &endptr); if (*endptr != ',') goto bad;
1505 i2 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != ',') goto bad;
1506 i3 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != '\0') goto bad;
1508 // Check for overflow.
1509 cache->size = (Int)i1;
1510 cache->assoc = (Int)i2;
1511 cache->line_size = (Int)i3;
1512 if (cache->size != i1) goto overflow;
1513 if (cache->assoc != i2) goto overflow;
1514 if (cache->line_size != i3) goto overflow;
1516 checkRes = check_cache(cache);
1518 VG_(fmsg)("%s", checkRes);
1525 VG_(fmsg_bad_option)(opt, "");
1528 VG_(fmsg_bad_option)(opt,
1529 "One of the cache parameters was too large and overflowed.\n");
1532 /* Check for command line option for cache configuration.
1533 * Return False if unknown and not handled.
1535 * Called from CLG_(process_cmd_line_option)() in clo.c
1537 static Bool cachesim_parse_opt(Char* arg)
1541 if VG_BOOL_CLO(arg, "--simulate-wb", clo_simulate_writeback) {}
1542 else if VG_BOOL_CLO(arg, "--simulate-hwpref", clo_simulate_hwpref) {}
1543 else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors) {}
1545 else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) {
1546 if (clo_collect_cacheuse) {
1547 /* Use counters only make sense with per-instruction dumping */
1548 CLG_(clo).dump_instr = True;
1552 else if VG_STR_CLO(arg, "--I1", tmp_str)
1553 parse_opt(&clo_I1_cache, arg, tmp_str);
1554 else if VG_STR_CLO(arg, "--D1", tmp_str)
1555 parse_opt(&clo_D1_cache, arg, tmp_str);
1556 else if VG_STR_CLO(arg, "--L2", tmp_str)
1557 parse_opt(&clo_L2_cache, arg, tmp_str);
1564 /* Adds commas to a ULong, right-justifying it in a field field_width wide;
1565 * returns the string in buf. */
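/* For example, commify(1234567, 12, buf) produces "   1,234,567". */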
1567 Int commify(ULong n, int field_width, char* buf)
1569 int len, n_commas, i, j, new_len, space;
1571 VG_(sprintf)(buf, "%llu", n);
1572 len = VG_(strlen)(buf);
1573 n_commas = (len - 1) / 3;
1574 new_len = len + n_commas;
1575 space = field_width - new_len;
1577 /* Allow for printing a number in a field_width smaller than its size */
1578 if (space < 0) space = 0;
1580 /* Make j = -1 because we copy the '\0' before doing the numbers in groups of three. */
1582 for (j = -1, i = len ; i >= 0; i--) {
1583 buf[i + n_commas + space] = buf[i];
1585 if ((i>0) && (3 == ++j)) {
1588 buf[i + n_commas + space] = ',';
1591 /* Right justify in field. */
1592 for (i = 0; i < space; i++) buf[i] = ' ';
1597 void percentify(Int n, Int ex, Int field_width, char buf[])
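/* e.g. percentify(1234, 100, 7, buf) yields " 12.34%" */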
1601 VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
1602 len = VG_(strlen)(buf);
1603 space = field_width - len;
1604 if (space < 0) space = 0; /* Allow for v. small field_width */
1607 /* Right justify in field */
1608 for ( ; i >= 0; i--) buf[i + space] = buf[i];
1609 for (i = 0; i < space; i++) buf[i] = ' ';
1613 void cachesim_printstat(Int l1, Int l2, Int l3)
1615 FullCost total = CLG_(total_cost), D_total = 0;
1616 ULong L2_total_m, L2_total_mr, L2_total_mw,
1617 L2_total, L2_total_r, L2_total_w;
1618 char buf1[RESULTS_BUF_LEN],
1619 buf2[RESULTS_BUF_LEN],
1620 buf3[RESULTS_BUF_LEN];
1623 if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
1624 VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu\n",
1626 VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu\n",
1628 VG_(message)(Vg_DebugMsg, "\n");
1631 commify(total[fullOffset(EG_IR) +1], l1, buf1);
1632 VG_(message)(Vg_UserMsg, "I1 misses: %s\n", buf1);
1634 commify(total[fullOffset(EG_IR) +2], l1, buf1);
1635 VG_(message)(Vg_UserMsg, "L2i misses: %s\n", buf1);
1639 if (0 == total[fullOffset(EG_IR)])
1640 total[fullOffset(EG_IR)] = 1;
1642 percentify(total[fullOffset(EG_IR)+1] * 100 * p /
1643 total[fullOffset(EG_IR)], p, l1+1, buf1);
1644 VG_(message)(Vg_UserMsg, "I1 miss rate: %s\n", buf1);
1646 percentify(total[fullOffset(EG_IR)+2] * 100 * p /
1647 total[fullOffset(EG_IR)], p, l1+1, buf1);
1648 VG_(message)(Vg_UserMsg, "L2i miss rate: %s\n", buf1);
1649 VG_(message)(Vg_UserMsg, "\n");
1652 /* Use the D_refs.rd and D_refs.wr values to determine the
1653 * width of columns 2 & 3. */
1655 D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
1656 CLG_(init_cost)( CLG_(sets).full, D_total);
1657 // we only use the first 3 values of D_total, adding up Dr and Dw costs
1658 CLG_(copy_cost)( CLG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR) );
1659 CLG_(add_cost) ( CLG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) );
1661 commify( D_total[0], l1, buf1);
1662 commify(total[fullOffset(EG_DR)], l2, buf2);
1663 commify(total[fullOffset(EG_DW)], l3, buf3);
1664 VG_(message)(Vg_UserMsg, "D refs: %s (%s rd + %s wr)\n",
1667 commify( D_total[1], l1, buf1);
1668 commify(total[fullOffset(EG_DR)+1], l2, buf2);
1669 commify(total[fullOffset(EG_DW)+1], l3, buf3);
1670 VG_(message)(Vg_UserMsg, "D1 misses: %s (%s rd + %s wr)\n",
1673 commify( D_total[2], l1, buf1);
1674 commify(total[fullOffset(EG_DR)+2], l2, buf2);
1675 commify(total[fullOffset(EG_DW)+2], l3, buf3);
1676 VG_(message)(Vg_UserMsg, "L2d misses: %s (%s rd + %s wr)\n",
1681 if (0 == D_total[0]) D_total[0] = 1;
1682 if (0 == total[fullOffset(EG_DR)]) total[fullOffset(EG_DR)] = 1;
1683 if (0 == total[fullOffset(EG_DW)]) total[fullOffset(EG_DW)] = 1;
1685 percentify( D_total[1] * 100 * p / D_total[0], p, l1+1, buf1);
1686 percentify(total[fullOffset(EG_DR)+1] * 100 * p /
1687 total[fullOffset(EG_DR)], p, l2+1, buf2);
1688 percentify(total[fullOffset(EG_DW)+1] * 100 * p /
1689 total[fullOffset(EG_DW)], p, l3+1, buf3);
1690 VG_(message)(Vg_UserMsg, "D1 miss rate: %s (%s + %s )\n",
1693 percentify( D_total[2] * 100 * p / D_total[0], p, l1+1, buf1);
1694 percentify(total[fullOffset(EG_DR)+2] * 100 * p /
1695 total[fullOffset(EG_DR)], p, l2+1, buf2);
1696 percentify(total[fullOffset(EG_DW)+2] * 100 * p /
1697 total[fullOffset(EG_DW)], p, l3+1, buf3);
1698 VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )\n",
1700 VG_(message)(Vg_UserMsg, "\n");
1704 /* L2 overall results */
1707 total[fullOffset(EG_DR) +1] +
1708 total[fullOffset(EG_DW) +1] +
1709 total[fullOffset(EG_IR) +1];
1711 total[fullOffset(EG_DR) +1] +
1712 total[fullOffset(EG_IR) +1];
1713 L2_total_w = total[fullOffset(EG_DW) +1];
1714 commify(L2_total, l1, buf1);
1715 commify(L2_total_r, l2, buf2);
1716 commify(L2_total_w, l3, buf3);
1717 VG_(message)(Vg_UserMsg, "L2 refs: %s (%s rd + %s wr)\n",
1721 total[fullOffset(EG_DR) +2] +
1722 total[fullOffset(EG_DW) +2] +
1723 total[fullOffset(EG_IR) +2];
1725 total[fullOffset(EG_DR) +2] +
1726 total[fullOffset(EG_IR) +2];
1727 L2_total_mw = total[fullOffset(EG_DW) +2];
1728 commify(L2_total_m, l1, buf1);
1729 commify(L2_total_mr, l2, buf2);
1730 commify(L2_total_mw, l3, buf3);
1731 VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)\n",
1734 percentify(L2_total_m * 100 * p /
1735 (total[fullOffset(EG_IR)] + D_total[0]), p, l1+1, buf1);
1736 percentify(L2_total_mr * 100 * p /
1737 (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
1739 percentify(L2_total_mw * 100 * p /
1740 total[fullOffset(EG_DW)], p, l3+1, buf3);
1741 VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )\n",
1746 /*------------------------------------------------------------*/
1747 /*--- Setup for Event set. ---*/
1748 /*------------------------------------------------------------*/
1750 struct event_sets CLG_(sets);
1752 void CLG_(init_eventsets)()
1754 // Event groups from which the event sets are composed
1755 // the "Use" group only is used with "cacheuse" simulation
1756 if (clo_collect_cacheuse)
1757 CLG_(register_event_group4)(EG_USE,
1758 "AcCost1", "SpLoss1", "AcCost2", "SpLoss2");
1760 if (!CLG_(clo).simulate_cache)
1761 CLG_(register_event_group)(EG_IR, "Ir");
1762 else if (!clo_simulate_writeback) {
1763 CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "I2mr");
1764 CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "D2mr");
1765 CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "D2mw");
1767 else { // clo_simulate_writeback
1768 CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "I2mr", "I2dmr");
1769 CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "D2mr", "D2dmr");
1770 CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "D2mw", "D2dmw");
1773 if (CLG_(clo).simulate_branch) {
1774 CLG_(register_event_group2)(EG_BC, "Bc", "Bcm");
1775 CLG_(register_event_group2)(EG_BI, "Bi", "Bim");
1778 if (CLG_(clo).collect_bus)
1779 CLG_(register_event_group)(EG_BUS, "Ge");
1781 if (CLG_(clo).collect_alloc)
1782 CLG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize");
1784 if (CLG_(clo).collect_systime)
1785 CLG_(register_event_group2)(EG_SYS, "sysCount", "sysTime");
1787 // event set used as base for instruction self cost
1788 CLG_(sets).base = CLG_(get_event_set2)(EG_USE, EG_IR);
1790 // event set comprising all event groups, used for inclusive cost
1791 CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
1792 CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_BC, EG_BI);
1793 CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
1794 CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);
1797 CLG_DEBUG(1, "EventSets:\n");
1798 CLG_(print_eventset)(-2, CLG_(sets).base);
1799 CLG_(print_eventset)(-2, CLG_(sets).full);
1802 /* Events that do not exist are silently ignored */
1803 CLG_(dumpmap) = CLG_(get_eventmapping)(CLG_(sets).full);
1804 CLG_(append_event)(CLG_(dumpmap), "Ir");
1805 CLG_(append_event)(CLG_(dumpmap), "Dr");
1806 CLG_(append_event)(CLG_(dumpmap), "Dw");
1807 CLG_(append_event)(CLG_(dumpmap), "I1mr");
1808 CLG_(append_event)(CLG_(dumpmap), "D1mr");
1809 CLG_(append_event)(CLG_(dumpmap), "D1mw");
1810 CLG_(append_event)(CLG_(dumpmap), "I2mr");
1811 CLG_(append_event)(CLG_(dumpmap), "D2mr");
1812 CLG_(append_event)(CLG_(dumpmap), "D2mw");
1813 CLG_(append_event)(CLG_(dumpmap), "I2dmr");
1814 CLG_(append_event)(CLG_(dumpmap), "D2dmr");
1815 CLG_(append_event)(CLG_(dumpmap), "D2dmw");
1816 CLG_(append_event)(CLG_(dumpmap), "Bc");
1817 CLG_(append_event)(CLG_(dumpmap), "Bcm");
1818 CLG_(append_event)(CLG_(dumpmap), "Bi");
1819 CLG_(append_event)(CLG_(dumpmap), "Bim");
1820 CLG_(append_event)(CLG_(dumpmap), "AcCost1");
1821 CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
1822 CLG_(append_event)(CLG_(dumpmap), "AcCost2");
1823 CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
1824 CLG_(append_event)(CLG_(dumpmap), "Ge");
1825 CLG_(append_event)(CLG_(dumpmap), "allocCount");
1826 CLG_(append_event)(CLG_(dumpmap), "allocSize");
1827 CLG_(append_event)(CLG_(dumpmap), "sysCount");
1828 CLG_(append_event)(CLG_(dumpmap), "sysTime");
1832 /* this is called at dump time for every instruction executed */
1833 static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
1834 InstrInfo* ii, ULong exe_count)
1836 if (!CLG_(clo).simulate_cache)
1837 cost[ fullOffset(EG_IR) ] += exe_count;
1840 CLG_(add_and_zero_cost2)( CLG_(sets).full, cost,
1841 ii->eventset, bbcc->cost + ii->cost_offset);
1845 void cachesim_finish(void)
1847 if (clo_collect_cacheuse)
1851 /*------------------------------------------------------------*/
1852 /*--- The simulator defined in this file ---*/
1853 /*------------------------------------------------------------*/
1855 struct cachesim_if CLG_(cachesim) = {
1856 .print_opts = cachesim_print_opts,
1857 .parse_opt = cachesim_parse_opt,
1858 .post_clo_init = cachesim_post_clo_init,
1859 .clear = cachesim_clear,
1860 .getdesc = cachesim_getdesc,
1861 .printstat = cachesim_printstat,
1862 .add_icost = cachesim_add_icost,
1863 .finish = cachesim_finish,
1865 /* these will be set by cachesim_post_clo_init */
1876 .log_1I0D_name = "(no function)",
1877 .log_2I0D_name = "(no function)",
1878 .log_3I0D_name = "(no function)",
1880 .log_1I1Dr_name = "(no function)",
1881 .log_1I1Dw_name = "(no function)",
1883 .log_0I1Dr_name = "(no function)",
1884 .log_0I1Dw_name = "(no function)",
1888 /*--------------------------------------------------------------------*/
1889 /*--- end ct_sim.c ---*/
1890 /*--------------------------------------------------------------------*/