l4/pkg/valgrind/src/valgrind-3.6.0-svn/callgrind/sim.c

   1
   2 /*--------------------------------------------------------------------*/
   3 /*--- Cache simulation.                                            ---*/
   4 /*---                                                        sim.c ---*/
   5 /*--------------------------------------------------------------------*/
   6
   7 /*
   8    This file is part of Callgrind, a Valgrind tool for call graph
   9    profiling programs.
  10
  11    Copyright (C) 2003-2005, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
  12
  13    This tool is derived from and contains code from Cachegrind
  14    Copyright (C) 2002-2010 Nicholas Nethercote (njn@valgrind.org)
  15
  16    This program is free software; you can redistribute it and/or
  17    modify it under the terms of the GNU General Public License as
  18    published by the Free Software Foundation; either version 2 of the
  19    License, or (at your option) any later version.
  20
  21    This program is distributed in the hope that it will be useful, but
  22    WITHOUT ANY WARRANTY; without even the implied warranty of
  23    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  24    General Public License for more details.
  25
  26    You should have received a copy of the GNU General Public License
  27    along with this program; if not, write to the Free Software
  28    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  29    02111-1307, USA.
  30
  31    The GNU General Public License is contained in the file COPYING.
  32 */
  33
  34 #include "global.h"
  35
  36
  37 /* Notes:
  38   - simulates a write-allocate cache
  39   - (block --> set) hash function uses simple bit selection
  40   - handling of references straddling two cache blocks:
  41       - counts as only one cache access (not two)
  42       - both blocks hit                  --> one hit
  43       - one block hits, the other misses --> one miss
  44       - both blocks miss                 --> one miss (not two)
  45 */
  46
  47 /* Cache configuration */
  48 #include "cg_arch.h"
  49
  50 /* additional structures for cache use info, separated
 * according to usage frequency:
  52  * - line_loaded : pointer to cost center of instruction
  53  *                 which loaded the line into cache.
  54  *                 Needed to increment counters when line is evicted.
  55  * - line_use    : updated on every access
  56  */
  57 typedef struct {
  58   UInt count;
  59   UInt mask; /* e.g. for 64Byte line size 1bit/2Byte */
  60 } line_use;
  61
  62 typedef struct {
  63   Addr memline, iaddr;
  64   line_use* dep_use; /* point to higher-level cacheblock for this memline */
  65   ULong* use_base;
  66 } line_loaded;
  67
  68 /* Cache state */
  69 typedef struct {
  70    char*        name;
  71    int          size;                   /* bytes */
  72    int          assoc;
  73    int          line_size;              /* bytes */
  74    Bool         sectored;  /* prefetch nearside cacheline on read */
  75    int          sets;
  76    int          sets_min_1;
  77    int          line_size_bits;
  78    int          tag_shift;
  79    UWord        tag_mask;
  80    char         desc_line[128];
  81    UWord*       tags;
  82
  83   /* for cache use */
  84    int          line_size_mask;
  85    int*         line_start_mask;
  86    int*         line_end_mask;
  87    line_loaded* loaded;
  88    line_use*    use;
  89 } cache_t2;
  90
  91 /*
  92  * States of flat caches in our model.
 * We use a 2-level hierarchy: I1 and D1 in front of a common L2.
  94  */
  95 static cache_t2 I1, D1, L2;
  96
  97 /* Lower bits of cache tags are used as flags for a cache line */
  98 #define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
  99 #define CACHELINE_DIRTY    1
 100
 101
 102 /* Cache simulator Options */
 103 static Bool clo_simulate_writeback = False;
 104 static Bool clo_simulate_hwpref = False;
 105 static Bool clo_simulate_sectors = False;
 106 static Bool clo_collect_cacheuse = False;
 107
/* The following global vars are set up beforehand by
 *  setup_bbcc()/cachesim_after_bbsetup():
 110  *
 111  * - Addr   bb_base     (instruction start address of original BB)
 112  * - ULong* cost_base   (start of cost array for BB)
 113  * - BBCC*  nonskipped  (only != 0 when in a function not skipped)
 114  */
 115
 116 /* Offset to events in event set, used in log_* functions
 117  * <off_EventSet_BasicEventSet>: offset where basic set is found
 118  */
 119 static Int off_UIr_Ir;
 120 static Int off_UIrDr_Ir,   off_UIrDr_Dr;
 121 static Int off_UIrDrDw_Ir, off_UIrDrDw_Dr, off_UIrDrDw_Dw;
 122 static Int off_UIrDw_Ir,   off_UIrDw_Dw;
 123 static Int off_UIrDwDr_Ir, off_UIrDwDr_Dr, off_UIrDwDr_Dw;
 124
 125 static Addr   bb_base;
 126 static ULong* cost_base;
 127 static InstrInfo* current_ii;
 128
 129 /* Cache use offsets */
 130 /* The offsets are only correct because all per-instruction event sets get
 131  * the "Use" set added first !
 132  */
 133 static Int off_I1_AcCost  = 0;
 134 static Int off_I1_SpLoss  = 1;
 135 static Int off_D1_AcCost  = 0;
 136 static Int off_D1_SpLoss  = 1;
 137 static Int off_L2_AcCost  = 2;
 138 static Int off_L2_SpLoss  = 3;
 139
 140 /* Cache access types */
 141 typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
 142
 143 /* Result of a reference into a flat cache */
 144 typedef enum { Hit  = 0, Miss, MissDirty } CacheResult;
 145
 146 /* Result of a reference into a hierarchical cache model */
 147 typedef enum {
 148     L1_Hit,
 149     L2_Hit,
 150     MemAccess,
 151     WriteBackMemAccess } CacheModelResult;
 152
 153 typedef CacheModelResult (*simcall_type)(Addr, UChar);
 154
 155 static struct {
 156     simcall_type I1_Read;
 157     simcall_type D1_Read;
 158     simcall_type D1_Write;
 159 } simulator;
 160
 161 /*------------------------------------------------------------*/
 162 /*--- Cache Simulator Initialization                       ---*/
 163 /*------------------------------------------------------------*/
 164
 165 static void cachesim_clearcache(cache_t2* c)
 166 {
 167   Int i;
 168
 169   for (i = 0; i < c->sets * c->assoc; i++)
 170     c->tags[i] = 0;
 171   if (c->use) {
 172     for (i = 0; i < c->sets * c->assoc; i++) {
 173       c->loaded[i].memline  = 0;
 174       c->loaded[i].use_base = 0;
 175       c->loaded[i].dep_use = 0;
 176       c->loaded[i].iaddr = 0;
 177       c->use[i].mask    = 0;
 178       c->use[i].count   = 0;
 179       c->tags[i] = i % c->assoc; /* init lower bits as pointer */
 180     }
 181   }
 182 }
 183
 184 static void cacheuse_initcache(cache_t2* c);
 185
 186 /* By this point, the size/assoc/line_size has been checked. */
 187 static void cachesim_initcache(cache_t config, cache_t2* c)
 188 {
 189    c->size      = config.size;
 190    c->assoc     = config.assoc;
 191    c->line_size = config.line_size;
 192    c->sectored  = False; // FIXME
 193
 194    c->sets           = (c->size / c->line_size) / c->assoc;
 195    c->sets_min_1     = c->sets - 1;
 196    c->line_size_bits = VG_(log2)(c->line_size);
 197    c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
 198    c->tag_mask       = ~((1<<c->tag_shift)-1);
 199
 200    /* Can bits in tag entries be used for flags?
 201     * Should be always true as MIN_LINE_SIZE >= 16 */
 202    CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);
 203
 204    if (c->assoc == 1) {
 205       VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
 206                    c->size, c->line_size,
 207                    c->sectored ? ", sectored":"");
 208    } else {
 209       VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
 210                    c->size, c->line_size, c->assoc,
 211                    c->sectored ? ", sectored":"");
 212    }
 213
 214    c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
 215                                  sizeof(UWord) * c->sets * c->assoc);
 216    if (clo_collect_cacheuse)
 217        cacheuse_initcache(c);
 218    else
 219      c->use = 0;
 220    cachesim_clearcache(c);
 221 }
 222
 223
 224 #if 0
 225 static void print_cache(cache_t2* c)
 226 {
 227    UInt set, way, i;
 228
 229    /* Note initialisation and update of 'i'. */
 230    for (i = 0, set = 0; set < c->sets; set++) {
 231       for (way = 0; way < c->assoc; way++, i++) {
 232          VG_(printf)("%8x ", c->tags[i]);
 233       }
 234       VG_(printf)("\n");
 235    }
 236 }
 237 #endif
 238
 239
 240 /*------------------------------------------------------------*/
 241 /*--- Write Through Cache Simulation                       ---*/
 242 /*------------------------------------------------------------*/
 243
 244 /*
 245  * Simple model: L1 & L2 Write Through
 * Does not distinguish between read and write references
 247  *
 248  * Simulator functions:
 249  *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 250  *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 251  */
 252
 253 static __inline__
 254 CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
 255 {
 256     int i, j;
 257     UWord *set;
 258
 259     set = &(c->tags[set_no * c->assoc]);
 260
 261     /* This loop is unrolled for just the first case, which is the most */
 262     /* common.  We can't unroll any further because it would screw up   */
 263     /* if we have a direct-mapped (1-way) cache.                        */
 264     if (tag == set[0])
 265         return Hit;
 266
 267     /* If the tag is one other than the MRU, move it into the MRU spot  */
 268     /* and shuffle the rest down.                                       */
 269     for (i = 1; i < c->assoc; i++) {
 270         if (tag == set[i]) {
 271             for (j = i; j > 0; j--) {
 272                 set[j] = set[j - 1];
 273             }
 274             set[0] = tag;
 275             return Hit;
 276         }
 277     }
 278
 279     /* A miss;  install this tag as MRU, shuffle rest down. */
 280     for (j = c->assoc - 1; j > 0; j--) {
 281         set[j] = set[j - 1];
 282     }
 283     set[0] = tag;
 284
 285     return Miss;
 286 }
 287
 288 static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
 289 {
 290     UInt  set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
 291     UInt  set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
 292     UWord tag  = a >> c->tag_shift;
 293
 294     /* Access entirely within line. */
 295     if (set1 == set2)
 296         return cachesim_setref(c, set1, tag);
 297
 298     /* Access straddles two lines. */
 299     /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
 300     else if (((set1 + 1) & (c->sets-1)) == set2) {
 301         UWord tag2  = (a+size-1) >> c->tag_shift;
 302
 303         /* the call updates cache structures as side effect */
 304         CacheResult res1 =  cachesim_setref(c, set1, tag);
 305         CacheResult res2 =  cachesim_setref(c, set2, tag2);
 306         return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
 307
 308    } else {
 309        VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
 310        VG_(tool_panic)("item straddles more than two cache sets");
 311    }
 312    return Hit;
 313 }
 314
 315 static
 316 CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 317 {
 318     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
 319     if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
 320     return MemAccess;
 321 }
 322
 323 static
 324 CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 325 {
 326     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
 327     if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
 328     return MemAccess;
 329 }
 330
 331
 332 /*------------------------------------------------------------*/
 333 /*--- Write Back Cache Simulation                          ---*/
 334 /*------------------------------------------------------------*/
 335
 336 /*
 337  * More complex model: L1 Write-through, L2 Write-back
 338  * This needs to distinguish among read and write references.
 339  *
 340  * Simulator functions:
 341  *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 342  *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 343  *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 344  */
 345
 346 /*
 347  * With write-back, result can be a miss evicting a dirty line
 348  * The dirty state of a cache line is stored in Bit0 of the tag for
 349  * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
 350  * type (Read/Write), the line gets dirty on a write.
 351  */
 352 static __inline__
 353 CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
 354 {
 355     int i, j;
 356     UWord *set, tmp_tag;
 357
 358     set = &(c->tags[set_no * c->assoc]);
 359
 360     /* This loop is unrolled for just the first case, which is the most */
 361     /* common.  We can't unroll any further because it would screw up   */
 362     /* if we have a direct-mapped (1-way) cache.                        */
 363     if (tag == (set[0] & ~CACHELINE_DIRTY)) {
 364         set[0] |= ref;
 365         return Hit;
 366     }
 367     /* If the tag is one other than the MRU, move it into the MRU spot  */
 368     /* and shuffle the rest down.                                       */
 369     for (i = 1; i < c->assoc; i++) {
 370         if (tag == (set[i] & ~CACHELINE_DIRTY)) {
 371             tmp_tag = set[i] | ref; // update dirty flag
 372             for (j = i; j > 0; j--) {
 373                 set[j] = set[j - 1];
 374             }
 375             set[0] = tmp_tag;
 376             return Hit;
 377         }
 378     }
 379
 380     /* A miss;  install this tag as MRU, shuffle rest down. */
 381     tmp_tag = set[c->assoc - 1];
 382     for (j = c->assoc - 1; j > 0; j--) {
 383         set[j] = set[j - 1];
 384     }
 385     set[0] = tag | ref;
 386
 387     return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
 388 }
 389
 390
 391 static __inline__
 392 CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
 393 {
 394     UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
 395     UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
 396     UWord tag = a & c->tag_mask;
 397
 398     /* Access entirely within line. */
 399     if (set1 == set2)
 400         return cachesim_setref_wb(c, ref, set1, tag);
 401
 402     /* Access straddles two lines. */
 403     /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
 404     else if (((set1 + 1) & (c->sets-1)) == set2) {
 405         UWord tag2  = (a+size-1) & c->tag_mask;
 406
 407         /* the call updates cache structures as side effect */
 408         CacheResult res1 =  cachesim_setref_wb(c, ref, set1, tag);
 409         CacheResult res2 =  cachesim_setref_wb(c, ref, set2, tag2);
 410
 411         if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
 412         return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
 413
 414    } else {
 415        VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
 416        VG_(tool_panic)("item straddles more than two cache sets");
 417    }
 418    return Hit;
 419 }
 420
 421
 422 static
 423 CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 424 {
 425     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
 426     switch( cachesim_ref_wb( &L2, Read, a, size) ) {
 427         case Hit: return L2_Hit;
 428         case Miss: return MemAccess;
 429         default: break;
 430     }
 431     return WriteBackMemAccess;
 432 }
 433
 434 static
 435 CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 436 {
 437     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
 438     switch( cachesim_ref_wb( &L2, Read, a, size) ) {
 439         case Hit: return L2_Hit;
 440         case Miss: return MemAccess;
 441         default: break;
 442     }
 443     return WriteBackMemAccess;
 444 }
 445
 446 static
 447 CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 448 {
 449     if ( cachesim_ref( &D1, a, size) == Hit ) {
 450         /* Even for a L1 hit, the write-trough L1 passes
 451          * the write to the L2 to make the L2 line dirty.
 452          * But this causes no latency, so return the hit.
 453          */
 454         cachesim_ref_wb( &L2, Write, a, size);
 455         return L1_Hit;
 456     }
 457     switch( cachesim_ref_wb( &L2, Write, a, size) ) {
 458         case Hit: return L2_Hit;
 459         case Miss: return MemAccess;
 460         default: break;
 461     }
 462     return WriteBackMemAccess;
 463 }
 464
 465
 466 /*------------------------------------------------------------*/
 467 /*--- Hardware Prefetch Simulation                         ---*/
 468 /*------------------------------------------------------------*/
 469
 470 static ULong prefetch_up = 0;
 471 static ULong prefetch_down = 0;
 472
 473 #define PF_STREAMS  8
 474 #define PF_PAGEBITS 12
 475
 476 static UInt pf_lastblock[PF_STREAMS];
 477 static Int  pf_seqblocks[PF_STREAMS];
 478
 479 static
 480 void prefetch_clear(void)
 481 {
 482   int i;
 483   for(i=0;i<PF_STREAMS;i++)
 484     pf_lastblock[i] = pf_seqblocks[i] = 0;
 485 }
 486
 487 /*
 488  * HW Prefetch emulation
 489  * Start prefetching when detecting sequential access to 3 memory blocks.
 490  * One stream can be detected per 4k page.
 491  */
 492 static __inline__
 493 void prefetch_L2_doref(Addr a)
 494 {
 495   UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
 496   UInt block = ( a >> L2.line_size_bits);
 497
 498   if (block != pf_lastblock[stream]) {
 499     if (pf_seqblocks[stream] == 0) {
 500       if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
 501       else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
 502     }
 503     else if (pf_seqblocks[stream] >0) {
 504       if (pf_lastblock[stream] +1 == block) {
 505         pf_seqblocks[stream]++;
 506         if (pf_seqblocks[stream] >= 2) {
 507           prefetch_up++;
 508           cachesim_ref(&L2, a + 5 * L2.line_size,1);
 509         }
 510       }
 511       else pf_seqblocks[stream] = 0;
 512     }
 513     else if (pf_seqblocks[stream] <0) {
 514       if (pf_lastblock[stream] -1 == block) {
 515         pf_seqblocks[stream]--;
 516         if (pf_seqblocks[stream] <= -2) {
 517           prefetch_down++;
 518           cachesim_ref(&L2, a - 5 * L2.line_size,1);
 519         }
 520       }
 521       else pf_seqblocks[stream] = 0;
 522     }
 523     pf_lastblock[stream] = block;
 524   }
 525 }
 526
 527 /* simple model with hardware prefetch */
 528
 529 static
 530 CacheModelResult prefetch_I1_ref(Addr a, UChar size)
 531 {
 532     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
 533     prefetch_L2_doref(a);
 534     if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
 535     return MemAccess;
 536 }
 537
 538 static
 539 CacheModelResult prefetch_D1_ref(Addr a, UChar size)
 540 {
 541     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
 542     prefetch_L2_doref(a);
 543     if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
 544     return MemAccess;
 545 }
 546
 547
 548 /* complex model with hardware prefetch */
 549
 550 static
 551 CacheModelResult prefetch_I1_Read(Addr a, UChar size)
 552 {
 553     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
 554     prefetch_L2_doref(a);
 555     switch( cachesim_ref_wb( &L2, Read, a, size) ) {
 556         case Hit: return L2_Hit;
 557         case Miss: return MemAccess;
 558         default: break;
 559     }
 560     return WriteBackMemAccess;
 561 }
 562
 563 static
 564 CacheModelResult prefetch_D1_Read(Addr a, UChar size)
 565 {
 566     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
 567     prefetch_L2_doref(a);
 568     switch( cachesim_ref_wb( &L2, Read, a, size) ) {
 569         case Hit: return L2_Hit;
 570         case Miss: return MemAccess;
 571         default: break;
 572     }
 573     return WriteBackMemAccess;
 574 }
 575
 576 static
 577 CacheModelResult prefetch_D1_Write(Addr a, UChar size)
 578 {
 579     prefetch_L2_doref(a);
 580     if ( cachesim_ref( &D1, a, size) == Hit ) {
 581         /* Even for a L1 hit, the write-trough L1 passes
 582          * the write to the L2 to make the L2 line dirty.
 583          * But this causes no latency, so return the hit.
 584          */
 585         cachesim_ref_wb( &L2, Write, a, size);
 586         return L1_Hit;
 587     }
 588     switch( cachesim_ref_wb( &L2, Write, a, size) ) {
 589         case Hit: return L2_Hit;
 590         case Miss: return MemAccess;
 591         default: break;
 592     }
 593     return WriteBackMemAccess;
 594 }
 595
 596
 597 /*------------------------------------------------------------*/
 598 /*--- Cache Simulation with use metric collection          ---*/
 599 /*------------------------------------------------------------*/
 600
 601 /* can not be combined with write-back or prefetch */
 602
 603 static
 604 void cacheuse_initcache(cache_t2* c)
 605 {
 606     int i;
 607     unsigned int start_mask, start_val;
 608     unsigned int end_mask, end_val;
 609
 610     c->use    = CLG_MALLOC("cl.sim.cu_ic.1",
 611                            sizeof(line_use) * c->sets * c->assoc);
 612     c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
 613                            sizeof(line_loaded) * c->sets * c->assoc);
 614     c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
 615                                     sizeof(int) * c->line_size);
 616     c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
 617                                   sizeof(int) * c->line_size);
 618
 619     c->line_size_mask = c->line_size-1;
 620
 621     /* Meaning of line_start_mask/line_end_mask
 622      * Example: for a given cache line, you get an access starting at
 623      * byte offset 5, length 4, byte 5 - 8 was touched. For a cache
 624      * line size of 32, you have 1 bit per byte in the mask:
 625      *
 626      *   bit31   bit8 bit5  bit 0
 627      *       |      |  |    |
 628      *       11..111111100000   line_start_mask[5]
 629      *       00..000111111111   line_end_mask[(5+4)-1]
 630      *
 631      *  use_mask |= line_start_mask[5] && line_end_mask[8]
 632      *
 633      */
 634     start_val = end_val = ~0;
 635     if (c->line_size < 32) {
 636         int bits_per_byte = 32/c->line_size;
 637         start_mask = (1<<bits_per_byte)-1;
 638         end_mask   = start_mask << (32-bits_per_byte);
 639         for(i=0;i<c->line_size;i++) {
 640             c->line_start_mask[i] = start_val;
 641             start_val  = start_val & ~start_mask;
 642             start_mask = start_mask << bits_per_byte;
 643
 644             c->line_end_mask[c->line_size-i-1] = end_val;
 645             end_val  = end_val & ~end_mask;
 646             end_mask = end_mask >> bits_per_byte;
 647         }
 648     }
 649     else {
 650         int bytes_per_bit = c->line_size/32;
 651         start_mask = 1;
 652         end_mask   = 1 << 31;
 653         for(i=0;i<c->line_size;i++) {
 654             c->line_start_mask[i] = start_val;
 655             c->line_end_mask[c->line_size-i-1] = end_val;
 656             if ( ((i+1)%bytes_per_bit) == 0) {
 657                 start_val   &= ~start_mask;
 658                 end_val     &= ~end_mask;
 659                 start_mask <<= 1;
 660                 end_mask   >>= 1;
 661             }
 662         }
 663     }
 664
 665     CLG_DEBUG(6, "Config %s:\n", c->desc_line);
 666     for(i=0;i<c->line_size;i++) {
 667         CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
 668                   i, c->line_start_mask[i], c->line_end_mask[i]);
 669     }
 670
 671     /* We use lower tag bits as offset pointers to cache use info.
 672      * I.e. some cache parameters don't work.
 673      */
 674     if ( (1<<c->tag_shift) < c->assoc) {
 675         VG_(message)(Vg_DebugMsg,
 676                      "error: Use associativity < %d for cache use statistics!\n",
 677                      (1<<c->tag_shift) );
 678         VG_(tool_panic)("Unsupported cache configuration");
 679     }
 680 }
 681
 682
 683 /* for I1/D1 caches */
 684 #define CACHEUSE(L)                                                         \
 685                                                                             \
 686 static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size)         \
 687 {                                                                           \
 688    UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);           \
 689    UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);           \
 690    UWord tag  = a & L.tag_mask;                                             \
 691    UWord tag2;                                                              \
 692    int i, j, idx;                                                           \
 693    UWord *set, tmp_tag;                                                     \
 694    UInt use_mask;                                                           \
 695                                                                             \
 696    CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n",                  \
 697             L.name, a, size, set1, set2);                                   \
 698                                                                             \
 699    /* First case: word entirely within line. */                             \
 700    if (set1 == set2) {                                                      \
 701                                                                             \
 702       set = &(L.tags[set1 * L.assoc]);                                      \
 703       use_mask = L.line_start_mask[a & L.line_size_mask] &                  \
 704                  L.line_end_mask[(a+size-1) & L.line_size_mask];            \
 705                                                                             \
 706       /* This loop is unrolled for just the first case, which is the most */\
 707       /* common.  We can't unroll any further because it would screw up   */\
 708       /* if we have a direct-mapped (1-way) cache.                        */\
 709       if (tag == (set[0] & L.tag_mask)) {                                   \
 710         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                    \
 711         L.use[idx].count ++;                                                \
 712         L.use[idx].mask |= use_mask;                                        \
 713         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 714                  idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
 715                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 716         return L1_Hit;                                                      \
 717       }                                                                     \
 718       /* If the tag is one other than the MRU, move it into the MRU spot  */\
 719       /* and shuffle the rest down.                                       */\
 720       for (i = 1; i < L.assoc; i++) {                                       \
 721          if (tag == (set[i] & L.tag_mask)) {                                \
 722             tmp_tag = set[i];                                               \
 723             for (j = i; j > 0; j--) {                                       \
 724                set[j] = set[j - 1];                                         \
 725             }                                                               \
 726             set[0] = tmp_tag;                                               \
 727             idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
 728             L.use[idx].count ++;                                            \
 729             L.use[idx].mask |= use_mask;                                    \
 730         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 731                  i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
 732                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 733             return L1_Hit;                                                  \
 734          }                                                                  \
 735       }                                                                     \
 736                                                                             \
 737       /* A miss;  install this tag as MRU, shuffle rest down. */            \
 738       tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
 739       for (j = L.assoc - 1; j > 0; j--) {                                   \
 740          set[j] = set[j - 1];                                               \
 741       }                                                                     \
 742       set[0] = tag | tmp_tag;                                               \
 743       idx = (set1 * L.assoc) + tmp_tag;                                     \
 744       return update_##L##_use(&L, idx,                                      \
 745                        use_mask, a &~ L.line_size_mask);                    \
 746                                                                             \
 747    /* Second case: word straddles two lines. */                             \
 748    /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
 749    } else if (((set1 + 1) & (L.sets-1)) == set2) {                          \
 750       Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */           \
 751       set = &(L.tags[set1 * L.assoc]);                                      \
 752       use_mask = L.line_start_mask[a & L.line_size_mask];                   \
 753       if (tag == (set[0] & L.tag_mask)) {                                   \
 754          idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
 755          L.use[idx].count ++;                                               \
 756          L.use[idx].mask |= use_mask;                                       \
 757         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 758                  idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
 759                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 760          goto block2;                                                       \
 761       }                                                                     \
 762       for (i = 1; i < L.assoc; i++) {                                       \
 763          if (tag == (set[i] & L.tag_mask)) {                                \
 764             tmp_tag = set[i];                                               \
 765             for (j = i; j > 0; j--) {                                       \
 766                set[j] = set[j - 1];                                         \
 767             }                                                               \
 768             set[0] = tmp_tag;                                               \
 769             idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
 770             L.use[idx].count ++;                                            \
 771             L.use[idx].mask |= use_mask;                                    \
 772         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 773                  i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
 774                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 775             goto block2;                                                    \
 776          }                                                                  \
 777       }                                                                     \
 778       tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
 779       for (j = L.assoc - 1; j > 0; j--) {                                   \
 780          set[j] = set[j - 1];                                               \
 781       }                                                                     \
 782       set[0] = tag | tmp_tag;                                               \
 783       idx = (set1 * L.assoc) + tmp_tag;                                     \
 784       miss1 = update_##L##_use(&L, idx,                                     \
 785                        use_mask, a &~ L.line_size_mask);                    \
 786 block2:                                                                     \
 787       set = &(L.tags[set2 * L.assoc]);                                      \
 788       use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask];            \
 789       tag2  = (a+size-1) & L.tag_mask;                                      \
 790       if (tag2 == (set[0] & L.tag_mask)) {                                  \
 791          idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask);                   \
 792          L.use[idx].count ++;                                               \
 793          L.use[idx].mask |= use_mask;                                       \
 794         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 795                  idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
 796                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 797          return miss1;                                                      \
 798       }                                                                     \
 799       for (i = 1; i < L.assoc; i++) {                                       \
 800          if (tag2 == (set[i] & L.tag_mask)) {                               \
 801             tmp_tag = set[i];                                               \
 802             for (j = i; j > 0; j--) {                                       \
 803                set[j] = set[j - 1];                                         \
 804             }                                                               \
 805             set[0] = tmp_tag;                                               \
 806             idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
 807             L.use[idx].count ++;                                            \
 808             L.use[idx].mask |= use_mask;                                    \
 809         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 810                  i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
 811                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 812             return miss1;                                                   \
 813          }                                                                  \
 814       }                                                                     \
 815       tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
 816       for (j = L.assoc - 1; j > 0; j--) {                                   \
 817          set[j] = set[j - 1];                                               \
 818       }                                                                     \
 819       set[0] = tag2 | tmp_tag;                                              \
 820       idx = (set2 * L.assoc) + tmp_tag;                                     \
 821       miss2 = update_##L##_use(&L, idx,                                     \
 822                        use_mask, (a+size-1) &~ L.line_size_mask);           \
 823       return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit;     \
 824                                                                             \
 825    } else {                                                                 \
 826        VG_(printf)("addr: %#lx  size: %u  sets: %d %d", a, size, set1, set2); \
 827        VG_(tool_panic)("item straddles more than two cache sets");          \
 828    }                                                                        \
 829    return 0;                                                                \
 830 }
 831
 832
 833 /* logarithmic bitcounting algorithm, see
 834  * http://graphics.stanford.edu/~seander/bithacks.html
 835  */
 836 static __inline__ unsigned int countBits(unsigned int bits)
 837 {
 838   unsigned int c; // store the total here
 839   const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
 840   const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};
 841
 842   c = bits;
 843   c = ((c >> S[0]) & B[0]) + (c & B[0]);
 844   c = ((c >> S[1]) & B[1]) + (c & B[1]);
 845   c = ((c >> S[2]) & B[2]) + (c & B[2]);
 846   c = ((c >> S[3]) & B[3]) + (c & B[3]);
 847   c = ((c >> S[4]) & B[4]) + (c & B[4]);
 848   return c;
 849 }
 850
 851 static void update_L2_use(int idx, Addr memline)
 852 {
 853   line_loaded* loaded = &(L2.loaded[idx]);
 854   line_use* use = &(L2.use[idx]);
 855   int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;
 856
 857   CLG_DEBUG(2, " L2.miss [%d]: at %#lx accessing memline %#lx\n",
 858            idx, bb_base + current_ii->instr_offset, memline);
 859   if (use->count>0) {
 860     CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
 861              use->count, i, use->mask, loaded->memline, loaded->iaddr);
 862     CLG_DEBUG(2, "   collect: %d, use_base %p\n",
 863              CLG_(current_state).collect, loaded->use_base);
 864
 865     if (CLG_(current_state).collect && loaded->use_base) {
 866       (loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
 867       (loaded->use_base)[off_L2_SpLoss] += i;
 868     }
 869    }
 870
 871    use->count = 0;
 872    use->mask  = 0;
 873
 874   loaded->memline = memline;
 875   loaded->iaddr   = bb_base + current_ii->instr_offset;
 876   loaded->use_base = (CLG_(current_state).nonskipped) ?
 877     CLG_(current_state).nonskipped->skipped :
 878     cost_base + current_ii->cost_offset;
 879 }
 880
 881 static
 882 CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
 883 {
 884    UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
 885    UWord* set = &(L2.tags[setNo * L2.assoc]);
 886    UWord tag  = memline & L2.tag_mask;
 887
 888    int i, j, idx;
 889    UWord tmp_tag;
 890
 891    CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);
 892
 893    if (tag == (set[0] & L2.tag_mask)) {
 894      idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask);
 895      l1_loaded->dep_use = &(L2.use[idx]);
 896
 897      CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
 898                  idx, L2.loaded[idx].memline,  L2.loaded[idx].iaddr,
 899                  L2.use[idx].mask, L2.use[idx].count);
 900      return L2_Hit;
 901    }
 902    for (i = 1; i < L2.assoc; i++) {
 903      if (tag == (set[i] & L2.tag_mask)) {
 904        tmp_tag = set[i];
 905        for (j = i; j > 0; j--) {
 906          set[j] = set[j - 1];
 907        }
 908        set[0] = tmp_tag;
 909        idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask);
 910        l1_loaded->dep_use = &(L2.use[idx]);
 911
 912         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
 913                  i, idx, L2.loaded[idx].memline,  L2.loaded[idx].iaddr,
 914                  L2.use[idx].mask, L2.use[idx].count);
 915         return L2_Hit;
 916      }
 917    }
 918
 919    /* A miss;  install this tag as MRU, shuffle rest down. */
 920    tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
 921    for (j = L2.assoc - 1; j > 0; j--) {
 922      set[j] = set[j - 1];
 923    }
 924    set[0] = tag | tmp_tag;
 925    idx = (setNo * L2.assoc) + tmp_tag;
 926    l1_loaded->dep_use = &(L2.use[idx]);
 927
 928    update_L2_use(idx, memline);
 929
 930    return MemAccess;
 931 }
 932
 933
 934
 935
 936 #define UPDATE_USE(L)                                                \
 937                                                                      \
 938 static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
 939                                UInt mask, Addr memline)              \
 940 {                                                                    \
 941   line_loaded* loaded = &(cache->loaded[idx]);                       \
 942   line_use* use = &(cache->use[idx]);                                \
 943   int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;       \
 944                                                                      \
 945   CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
 946            cache->name, idx, bb_base + current_ii->instr_offset, memline, mask); \
 947   if (use->count>0) {                                                \
 948     CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",\
 949              use->count, c, use->mask, loaded->memline, loaded->iaddr); \
 950     CLG_DEBUG(2, "   collect: %d, use_base %p\n", \
 951              CLG_(current_state).collect, loaded->use_base);         \
 952                                                                      \
 953     if (CLG_(current_state).collect && loaded->use_base) {            \
 954       (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;     \
 955       (loaded->use_base)[off_##L##_SpLoss] += c;                     \
 956                                                                      \
 957       /* FIXME (?): L1/L2 line sizes must be equal ! */              \
 958       loaded->dep_use->mask |= use->mask;                            \
 959       loaded->dep_use->count += use->count;                          \
 960     }                                                                \
 961   }                                                                  \
 962                                                                      \
 963   use->count = 1;                                                    \
 964   use->mask  = mask;                                                 \
 965   loaded->memline = memline;                                         \
 966   loaded->iaddr   = bb_base + current_ii->instr_offset;              \
 967   loaded->use_base = (CLG_(current_state).nonskipped) ?               \
 968     CLG_(current_state).nonskipped->skipped :                         \
 969     cost_base + current_ii->cost_offset;                             \
 970                                                                      \
 971   if (memline == 0) return L2_Hit;                                   \
 972   return cacheuse_L2_access(memline, loaded);                        \
 973 }
 974
 975 UPDATE_USE(I1);
 976 UPDATE_USE(D1);
 977
 978 CACHEUSE(I1);
 979 CACHEUSE(D1);
 980
 981
 982 static
 983 void cacheuse_finish(void)
 984 {
 985   int i;
 986   InstrInfo ii = { 0,0,0,0 };
 987
 988   if (!CLG_(current_state).collect) return;
 989
 990   bb_base = 0;
 991   current_ii = &ii;
 992   cost_base = 0;
 993
 994   /* update usage counters */
 995   if (I1.use)
 996     for (i = 0; i < I1.sets * I1.assoc; i++)
 997       if (I1.loaded[i].use_base)
 998         update_I1_use( &I1, i, 0,0);
 999
1000   if (D1.use)
1001     for (i = 0; i < D1.sets * D1.assoc; i++)
1002       if (D1.loaded[i].use_base)
1003         update_D1_use( &D1, i, 0,0);
1004
1005   if (L2.use)
1006     for (i = 0; i < L2.sets * L2.assoc; i++)
1007       if (L2.loaded[i].use_base)
1008         update_L2_use(i, 0);
1009 }
1010
1011
1012
1013 /*------------------------------------------------------------*/
1014 /*--- Helper functions called by instrumented code         ---*/
1015 /*------------------------------------------------------------*/
1016
1017
1018 static __inline__
1019 void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
1020 {
1021     switch(r) {
1022         case WriteBackMemAccess:
1023             if (clo_simulate_writeback) {
1024                 c1[3]++;
1025                 c2[3]++;
1026             }
1027             // fall through
1028
1029         case MemAccess:
1030             c1[2]++;
1031             c2[2]++;
1032             // fall through
1033
1034         case L2_Hit:
1035             c1[1]++;
1036             c2[1]++;
1037             // fall through
1038
1039         default:
1040             c1[0]++;
1041             c2[0]++;
1042     }
1043 }
1044
1045 static
1046 Char* cacheRes(CacheModelResult r)
1047 {
1048     switch(r) {
1049     case L1_Hit:    return "L1 Hit ";
1050     case L2_Hit:    return "L2 Hit ";
1051     case MemAccess: return "L2 Miss";
1052     case WriteBackMemAccess: return "L2 Miss (dirty)";
1053     default:
1054         tl_assert(0);
1055     }
1056     return "??";
1057 }
1058
1059 VG_REGPARM(1)
1060 static void log_1I0D(InstrInfo* ii)
1061 {
1062     CacheModelResult IrRes;
1063
1064     current_ii = ii;
1065     IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1066
1067     CLG_DEBUG(6, "log_1I0D:  Ir  %#lx/%u => %s\n",
1068               bb_base + ii->instr_offset, ii->instr_size, cacheRes(IrRes));
1069
1070     if (CLG_(current_state).collect) {
1071         ULong* cost_Ir;
1072
1073         if (CLG_(current_state).nonskipped)
1074             cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
1075         else
1076             cost_Ir = cost_base + ii->cost_offset + off_UIr_Ir;
1077
1078         inc_costs(IrRes, cost_Ir,
1079                   CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1080     }
1081 }
1082
1083 VG_REGPARM(2)
1084 static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
1085 {
1086     CacheModelResult Ir1Res, Ir2Res;
1087     ULong *global_cost_Ir;
1088
1089     current_ii = ii1;
1090     Ir1Res = (*simulator.I1_Read)(bb_base + ii1->instr_offset, ii1->instr_size);
1091     current_ii = ii2;
1092     Ir2Res = (*simulator.I1_Read)(bb_base + ii2->instr_offset, ii2->instr_size);
1093
1094     CLG_DEBUG(6, "log_2I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
1095               bb_base + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
1096               bb_base + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );
1097
1098     if (!CLG_(current_state).collect) return;
1099
1100     global_cost_Ir = CLG_(current_state).cost + CLG_(sets).off_full_Ir;
1101     if (CLG_(current_state).nonskipped) {
1102         ULong* skipped_cost_Ir = CLG_(current_state).nonskipped->skipped +
1103                                  CLG_(sets).off_full_Ir;
1104         inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
1105         inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
1106         return;
1107     }
1108
1109     inc_costs(Ir1Res, global_cost_Ir, cost_base + ii1->cost_offset + off_UIr_Ir);
1110     inc_costs(Ir2Res, global_cost_Ir, cost_base + ii2->cost_offset + off_UIr_Ir);
1111 }
1112
1113 VG_REGPARM(3)
1114 static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
1115 {
1116     CacheModelResult Ir1Res, Ir2Res, Ir3Res;
1117     ULong *global_cost_Ir;
1118
1119     current_ii = ii1;
1120     Ir1Res = (*simulator.I1_Read)(bb_base + ii1->instr_offset, ii1->instr_size);
1121     current_ii = ii2;
1122     Ir2Res = (*simulator.I1_Read)(bb_base + ii2->instr_offset, ii2->instr_size);
1123     current_ii = ii3;
1124     Ir3Res = (*simulator.I1_Read)(bb_base + ii3->instr_offset, ii3->instr_size);
1125
1126     CLG_DEBUG(6, "log_3I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
1127               bb_base + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
1128               bb_base + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
1129               bb_base + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );
1130
1131     if (!CLG_(current_state).collect) return;
1132
1133     global_cost_Ir = CLG_(current_state).cost + CLG_(sets).off_full_Ir;
1134     if (CLG_(current_state).nonskipped) {
1135         ULong* skipped_cost_Ir = CLG_(current_state).nonskipped->skipped +
1136                                  CLG_(sets).off_full_Ir;
1137         inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
1138         inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
1139         inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
1140         return;
1141     }
1142
1143     inc_costs(Ir1Res, global_cost_Ir, cost_base + ii1->cost_offset + off_UIr_Ir);
1144     inc_costs(Ir2Res, global_cost_Ir, cost_base + ii2->cost_offset + off_UIr_Ir);
1145     inc_costs(Ir3Res, global_cost_Ir, cost_base + ii3->cost_offset + off_UIr_Ir);
1146 }
1147
1148 /* Instruction doing a read access */
1149
1150 VG_REGPARM(3)
1151 static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
1152 {
1153     CacheModelResult IrRes, DrRes;
1154
1155     current_ii = ii;
1156     IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1157     DrRes = (*simulator.D1_Read)(data_addr, data_size);
1158
1159     CLG_DEBUG(6, "log_1I1Dr: Ir  %#lx/%u => %s, Dr  %#lx/%lu => %s\n",
1160               bb_base + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
1161               data_addr, data_size, cacheRes(DrRes));
1162
1163     if (CLG_(current_state).collect) {
1164         ULong *cost_Ir, *cost_Dr;
1165
1166         if (CLG_(current_state).nonskipped) {
1167             cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
1168             cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
1169         }
1170         else {
1171             // event set must be UIrDr or extension
1172             CLG_ASSERT((ii->eventset == CLG_(sets).UIrDr) ||
1173                        (ii->eventset == CLG_(sets).UIrDrDw));
1174             cost_Ir = cost_base + ii->cost_offset + off_UIrDr_Ir;
1175             cost_Dr = cost_base + ii->cost_offset + off_UIrDr_Dr;
1176         }
1177
1178         inc_costs(IrRes, cost_Ir,
1179                   CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1180         inc_costs(DrRes, cost_Dr,
1181                   CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1182     }
1183 }
1184
1185
1186 VG_REGPARM(3)
1187 static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
1188 {
1189     CacheModelResult DrRes;
1190
1191     current_ii = ii;
1192     DrRes = (*simulator.D1_Read)(data_addr, data_size);
1193
1194     CLG_DEBUG(6, "log_0I1Dr: Dr  %#lx/%lu => %s\n",
1195               data_addr, data_size, cacheRes(DrRes));
1196
1197     if (CLG_(current_state).collect) {
1198         ULong *cost_Dr;
1199
1200         if (CLG_(current_state).nonskipped) {
1201             cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
1202         }
1203         else {
1204             Int off_Dr;
1205             if      (ii->eventset == CLG_(sets).UIrDr)   off_Dr = off_UIrDr_Dr;
1206             else if (ii->eventset == CLG_(sets).UIrDrDw) off_Dr = off_UIrDrDw_Dr;
1207             else if (ii->eventset == CLG_(sets).UIrDwDr) off_Dr = off_UIrDwDr_Dr;
1208             else CLG_ASSERT(0);
1209
1210             cost_Dr = cost_base + ii->cost_offset + off_Dr;
1211         }
1212
1213         inc_costs(DrRes, cost_Dr,
1214                   CLG_(current_state).cost + CLG_(sets).off_full_Dr );
1215     }
1216 }
1217
1218
1219 /* Instruction doing a write access */
1220
1221 VG_REGPARM(3)
1222 static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
1223 {
1224     CacheModelResult IrRes, DwRes;
1225
1226     current_ii = ii;
1227     IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
1228     DwRes = (*simulator.D1_Write)(data_addr, data_size);
1229
1230     CLG_DEBUG(6, "log_1I1Dw: Ir  %#lx/%u => %s, Dw  %#lx/%lu => %s\n",
1231               bb_base + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
1232               data_addr, data_size, cacheRes(DwRes));
1233
1234     if (CLG_(current_state).collect) {
1235         ULong *cost_Ir, *cost_Dw;
1236
1237         if (CLG_(current_state).nonskipped) {
1238             cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
1239             cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
1240         }
1241         else {
1242             // This helper is called when a Dr event follows Ir;
1243             // Event set must be UIrDw or extension
1244             CLG_ASSERT((ii->eventset == CLG_(sets).UIrDw) ||
1245                        (ii->eventset == CLG_(sets).UIrDwDr));
1246             cost_Ir = cost_base + ii->cost_offset + off_UIrDw_Ir;
1247             cost_Dw = cost_base + ii->cost_offset + off_UIrDw_Dw;
1248         }
1249
1250         inc_costs(IrRes, cost_Ir,
1251                   CLG_(current_state).cost + CLG_(sets).off_full_Ir );
1252         inc_costs(DwRes, cost_Dw,
1253                   CLG_(current_state).cost + CLG_(sets).off_full_Dw );
1254     }
1255 }
1256
1257 VG_REGPARM(3)
1258 static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
1259 {
1260     CacheModelResult DwRes;
1261
1262     current_ii = ii;
1263     DwRes = (*simulator.D1_Write)(data_addr, data_size);
1264
1265     CLG_DEBUG(6, "log_0I1Dw: Dw  %#lx/%lu => %s\n",
1266               data_addr, data_size, cacheRes(DwRes));
1267
1268     if (CLG_(current_state).collect) {
1269         ULong *cost_Dw;
1270
1271         if (CLG_(current_state).nonskipped) {
1272             cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
1273         }
1274         else {
1275             Int off_Dw;
1276             if      (ii->eventset == CLG_(sets).UIrDw)   off_Dw = off_UIrDw_Dw;
1277             else if (ii->eventset == CLG_(sets).UIrDwDr) off_Dw = off_UIrDwDr_Dw;
1278             else if (ii->eventset == CLG_(sets).UIrDrDw) off_Dw = off_UIrDrDw_Dw;
1279             else CLG_ASSERT(0);
1280
1281             cost_Dw = cost_base + ii->cost_offset + off_Dw;
1282         }
1283
1284         inc_costs(DwRes, cost_Dw,
1285                   CLG_(current_state).cost + CLG_(sets).off_full_Dw );
1286     }
1287 }
1288
1289
1290
1291 /*------------------------------------------------------------*/
1292 /*--- Cache configuration                                  ---*/
1293 /*------------------------------------------------------------*/
1294
1295 #define UNDEFINED_CACHE     ((cache_t) { -1, -1, -1 })
1296
1297 static cache_t clo_I1_cache = UNDEFINED_CACHE;
1298 static cache_t clo_D1_cache = UNDEFINED_CACHE;
1299 static cache_t clo_L2_cache = UNDEFINED_CACHE;
1300
1301
1302 /* Checks cache config is ok;  makes it so if not. */
1303 static
1304 void check_cache(cache_t* cache, Char *name)
1305 {
1306    /* Simulator requires line size and set count to be powers of two */
1307    if (( cache->size % (cache->line_size * cache->assoc) != 0) ||
1308        (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) {
1309       VG_(message)(Vg_UserMsg,
1310          "error: %s set count not a power of two; aborting.\n",
1311          name);
1312    }
1313
1314    if (-1 == VG_(log2)(cache->line_size)) {
1315       VG_(message)(Vg_UserMsg,
1316          "error: %s line size of %dB not a power of two; aborting.\n",
1317          name, cache->line_size);
1318       VG_(exit)(1);
1319    }
1320
1321    // Then check line size >= 16 -- any smaller and a single instruction could
1322    // straddle three cache lines, which breaks a simulation assertion and is
1323    // stupid anyway.
1324    if (cache->line_size < MIN_LINE_SIZE) {
1325       VG_(message)(Vg_UserMsg,
1326          "error: %s line size of %dB too small; aborting.\n",
1327          name, cache->line_size);
1328       VG_(exit)(1);
1329    }
1330
1331    /* Then check cache size > line size (causes seg faults if not). */
1332    if (cache->size <= cache->line_size) {
1333       VG_(message)(Vg_UserMsg,
1334          "error: %s cache size of %dB <= line size of %dB; aborting.\n",
1335          name, cache->size, cache->line_size);
1336       VG_(exit)(1);
1337    }
1338
1339    /* Then check assoc <= (size / line size) (seg faults otherwise). */
1340    if (cache->assoc > (cache->size / cache->line_size)) {
1341       VG_(message)(Vg_UserMsg,
1342          "warning: %s associativity > (size / line size); aborting.\n", name);
1343       VG_(exit)(1);
1344    }
1345 }
1346
1347 static
1348 void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
1349 {
1350 #define DEFINED(L)   (-1 != L.size  || -1 != L.assoc || -1 != L.line_size)
1351
1352    Int n_clos = 0;
1353
1354    // Count how many were defined on the command line.
1355    if (DEFINED(clo_I1_cache)) { n_clos++; }
1356    if (DEFINED(clo_D1_cache)) { n_clos++; }
1357    if (DEFINED(clo_L2_cache)) { n_clos++; }
1358
1359    // Set the cache config (using auto-detection, if supported by the
1360    // architecture)
1361    VG_(configure_caches)( I1c, D1c, L2c, (3 == n_clos) );
1362
1363    // Then replace with any defined on the command line.
1364    if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
1365    if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
1366    if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
1367
1368    // Then check values and fix if not acceptable.
1369    check_cache(I1c, "I1");
1370    check_cache(D1c, "D1");
1371    check_cache(L2c, "L2");
1372
1373    if (VG_(clo_verbosity) > 1) {
1374       VG_(message)(Vg_UserMsg, "Cache configuration used:\n");
1375       VG_(message)(Vg_UserMsg, "  I1: %dB, %d-way, %dB lines\n",
1376                                I1c->size, I1c->assoc, I1c->line_size);
1377       VG_(message)(Vg_UserMsg, "  D1: %dB, %d-way, %dB lines\n",
1378                                D1c->size, D1c->assoc, D1c->line_size);
1379       VG_(message)(Vg_UserMsg, "  L2: %dB, %d-way, %dB lines\n",
1380                                L2c->size, L2c->assoc, L2c->line_size);
1381    }
1382 #undef CMD_LINE_DEFINED
1383 }
1384
1385
1386 /* Initialize and clear simulator state */
1387 static void cachesim_post_clo_init(void)
1388 {
1389   /* Cache configurations. */
1390   cache_t  I1c, D1c, L2c;
1391
1392   /* Initialize access handlers */
1393   if (!CLG_(clo).simulate_cache) {
1394     CLG_(cachesim).log_1I0D  = 0;
1395     CLG_(cachesim).log_1I0D_name = "(no function)";
1396     CLG_(cachesim).log_2I0D  = 0;
1397     CLG_(cachesim).log_2I0D_name = "(no function)";
1398     CLG_(cachesim).log_3I0D  = 0;
1399     CLG_(cachesim).log_3I0D_name = "(no function)";
1400
1401     CLG_(cachesim).log_1I1Dr = 0;
1402     CLG_(cachesim).log_1I1Dr_name = "(no function)";
1403     CLG_(cachesim).log_1I1Dw = 0;
1404     CLG_(cachesim).log_1I1Dw_name = "(no function)";
1405
1406     CLG_(cachesim).log_0I1Dr = 0;
1407     CLG_(cachesim).log_0I1Dr_name = "(no function)";
1408     CLG_(cachesim).log_0I1Dw = 0;
1409     CLG_(cachesim).log_0I1Dw_name = "(no function)";
1410     return;
1411   }
1412
1413   /* Configuration of caches only needed with real cache simulation */
1414   configure_caches(&I1c, &D1c, &L2c);
1415
1416   I1.name = "I1";
1417   D1.name = "D1";
1418   L2.name = "L2";
1419
1420   cachesim_initcache(I1c, &I1);
1421   cachesim_initcache(D1c, &D1);
1422   cachesim_initcache(L2c, &L2);
1423
1424   /* the other cache simulators use the standard helpers
1425    * with dispatching via simulator struct */
1426
1427   CLG_(cachesim).log_1I0D  = log_1I0D;
1428   CLG_(cachesim).log_1I0D_name  = "log_1I0D";
1429   CLG_(cachesim).log_2I0D  = log_2I0D;
1430   CLG_(cachesim).log_2I0D_name  = "log_2I0D";
1431   CLG_(cachesim).log_3I0D  = log_3I0D;
1432   CLG_(cachesim).log_3I0D_name  = "log_3I0D";
1433
1434   CLG_(cachesim).log_1I1Dr = log_1I1Dr;
1435   CLG_(cachesim).log_1I1Dw = log_1I1Dw;
1436   CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
1437   CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
1438
1439   CLG_(cachesim).log_0I1Dr = log_0I1Dr;
1440   CLG_(cachesim).log_0I1Dw = log_0I1Dw;
1441   CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
1442   CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
1443
1444   if (clo_collect_cacheuse) {
1445
1446       /* Output warning for not supported option combinations */
1447       if (clo_simulate_hwpref) {
1448           VG_(message)(Vg_DebugMsg,
1449                        "warning: prefetch simulation can not be "
1450                        "used with cache usage\n");
1451           clo_simulate_hwpref = False;
1452       }
1453
1454       if (clo_simulate_writeback) {
1455           VG_(message)(Vg_DebugMsg,
1456                        "warning: write-back simulation can not be "
1457                        "used with cache usage\n");
1458           clo_simulate_writeback = False;
1459       }
1460
1461       simulator.I1_Read  = cacheuse_I1_doRead;
1462       simulator.D1_Read  = cacheuse_D1_doRead;
1463       simulator.D1_Write = cacheuse_D1_doRead;
1464       return;
1465   }
1466
1467   if (clo_simulate_hwpref) {
1468     prefetch_clear();
1469
1470     if (clo_simulate_writeback) {
1471       simulator.I1_Read  = prefetch_I1_Read;
1472       simulator.D1_Read  = prefetch_D1_Read;
1473       simulator.D1_Write = prefetch_D1_Write;
1474     }
1475     else {
1476       simulator.I1_Read  = prefetch_I1_ref;
1477       simulator.D1_Read  = prefetch_D1_ref;
1478       simulator.D1_Write = prefetch_D1_ref;
1479     }
1480
1481     return;
1482   }
1483
1484   if (clo_simulate_writeback) {
1485       simulator.I1_Read  = cachesim_I1_Read;
1486       simulator.D1_Read  = cachesim_D1_Read;
1487       simulator.D1_Write = cachesim_D1_Write;
1488   }
1489   else {
1490       simulator.I1_Read  = cachesim_I1_ref;
1491       simulator.D1_Read  = cachesim_D1_ref;
1492       simulator.D1_Write = cachesim_D1_ref;
1493   }
1494 }
1495
1496
1497 /* Clear simulator state. Has to be initialized before */
1498 static
1499 void cachesim_clear(void)
1500 {
1501   cachesim_clearcache(&I1);
1502   cachesim_clearcache(&D1);
1503   cachesim_clearcache(&L2);
1504
1505   prefetch_clear();
1506 }
1507
1508
1509 static void cachesim_getdesc(Char* buf)
1510 {
1511   Int p;
1512   p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
1513   p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
1514   VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
1515 }
1516
1517 static
1518 void cachesim_print_opts(void)
1519 {
1520   VG_(printf)(
1521 "\n   cache simulator options:\n"
1522 "    --simulate-cache=no|yes   Do cache simulation [no]\n"
1523 "    --simulate-wb=no|yes      Count write-back events [no]\n"
1524 "    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
1525 #if CLG_EXPERIMENTAL
1526 "    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
1527 #endif
1528 "    --cacheuse=no|yes         Collect cache block use [no]\n"
1529 "    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
1530 "    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
1531 "    --L2=<size>,<assoc>,<line_size>  set L2 cache manually\n"
1532               );
1533 }
1534
1535 static void parse_opt ( cache_t* cache, char* opt )
1536 {
1537    Long i1, i2, i3;
1538    Char* endptr;
1539
1540    // Option argument looks like "65536,2,64".  Extract them.
1541    i1 = VG_(strtoll10)(opt,      &endptr); if (*endptr != ',')  goto bad;
1542    i2 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != ',')  goto bad;
1543    i3 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != '\0') goto bad;
1544
1545    // Check for overflow.
1546    cache->size      = (Int)i1;
1547    cache->assoc     = (Int)i2;
1548    cache->line_size = (Int)i3;
1549    if (cache->size      != i1) goto overflow;
1550    if (cache->assoc     != i2) goto overflow;
1551    if (cache->line_size != i3) goto overflow;
1552
1553    return;
1554
1555   overflow:
1556    VG_(message)(Vg_UserMsg,
1557                 "one of the cache parameters was too large and overflowed\n");
1558   bad:
1559    // XXX: this omits the "--I1/D1/L2=" part from the message, but that's
1560    // not a big deal.
1561    VG_(err_bad_option)(opt);
1562 }
1563
1564 /* Check for command line option for cache configuration.
1565  * Return False if unknown and not handled.
1566  *
1567  * Called from CLG_(process_cmd_line_option)() in clo.c
1568  */
1569 static Bool cachesim_parse_opt(Char* arg)
1570 {
1571    Char* tmp_str;
1572
1573    if      VG_BOOL_CLO(arg, "--simulate-wb",      clo_simulate_writeback) {}
1574    else if VG_BOOL_CLO(arg, "--simulate-hwpref",  clo_simulate_hwpref)    {}
1575    else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors)   {}
1576
1577    else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) {
1578       if (clo_collect_cacheuse) {
1579          /* Use counters only make sense with fine dumping */
1580          CLG_(clo).dump_instr = True;
1581       }
1582    }
1583
1584    else if VG_STR_CLO(arg, "--I1", tmp_str)
1585       parse_opt(&clo_I1_cache, tmp_str);
1586    else if VG_STR_CLO(arg, "--D1", tmp_str)
1587       parse_opt(&clo_D1_cache, tmp_str);
1588    else if VG_STR_CLO(arg, "--L2", tmp_str)
1589       parse_opt(&clo_L2_cache, tmp_str);
1590   else
1591     return False;
1592
1593   return True;
1594 }
1595
1596 /* Adds commas to ULong, right justifying in a field field_width wide, returns
1597  * the string in buf. */
1598 static
1599 Int commify(ULong n, int field_width, char* buf)
1600 {
1601    int len, n_commas, i, j, new_len, space;
1602
1603    VG_(sprintf)(buf, "%llu", n);
1604    len = VG_(strlen)(buf);
1605    n_commas = (len - 1) / 3;
1606    new_len = len + n_commas;
1607    space = field_width - new_len;
1608
1609    /* Allow for printing a number in a field_width smaller than it's size */
1610    if (space < 0) space = 0;
1611
1612    /* Make j = -1 because we copy the '\0' before doing the numbers in groups
1613     * of three. */
1614    for (j = -1, i = len ; i >= 0; i--) {
1615       buf[i + n_commas + space] = buf[i];
1616
1617       if ((i>0) && (3 == ++j)) {
1618          j = 0;
1619          n_commas--;
1620          buf[i + n_commas + space] = ',';
1621       }
1622    }
1623    /* Right justify in field. */
1624    for (i = 0; i < space; i++)  buf[i] = ' ';
1625    return new_len;
1626 }
1627
1628 static
1629 void percentify(Int n, Int ex, Int field_width, char buf[])
1630 {
1631    int i, len, space;
1632
1633    VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
1634    len = VG_(strlen)(buf);
1635    space = field_width - len;
1636    if (space < 0) space = 0;     /* Allow for v. small field_width */
1637    i = len;
1638
1639    /* Right justify in field */
1640    for (     ; i >= 0;    i--)  buf[i + space] = buf[i];
1641    for (i = 0; i < space; i++)  buf[i] = ' ';
1642 }
1643
1644 static
1645 void cachesim_printstat(void)
1646 {
1647   FullCost total = CLG_(total_cost), D_total = 0;
1648   ULong L2_total_m, L2_total_mr, L2_total_mw,
1649     L2_total, L2_total_r, L2_total_w;
1650   char buf1[RESULTS_BUF_LEN],
1651     buf2[RESULTS_BUF_LEN],
1652     buf3[RESULTS_BUF_LEN];
1653   Int l1, l2, l3;
1654   Int p;
1655
1656   if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
1657     VG_(message)(Vg_DebugMsg, "Prefetch Up:       %llu\n",
1658                  prefetch_up);
1659     VG_(message)(Vg_DebugMsg, "Prefetch Down:     %llu\n",
1660                  prefetch_down);
1661     VG_(message)(Vg_DebugMsg, "\n");
1662   }
1663
1664   /* I cache results.  Use the I_refs value to determine the first column
1665    * width. */
1666   l1 = commify(total[CLG_(sets).off_full_Ir], 0, buf1);
1667   VG_(message)(Vg_UserMsg, "I   refs:      %s\n", buf1);
1668
1669   if (!CLG_(clo).simulate_cache) return;
1670
1671   commify(total[CLG_(sets).off_full_Ir +1], l1, buf1);
1672   VG_(message)(Vg_UserMsg, "I1  misses:    %s\n", buf1);
1673
1674   commify(total[CLG_(sets).off_full_Ir +2], l1, buf1);
1675   VG_(message)(Vg_UserMsg, "L2i misses:    %s\n", buf1);
1676
1677   p = 100;
1678
1679   if (0 == total[CLG_(sets).off_full_Ir])
1680     total[CLG_(sets).off_full_Ir] = 1;
1681
1682   percentify(total[CLG_(sets).off_full_Ir+1] * 100 * p /
1683              total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
1684   VG_(message)(Vg_UserMsg, "I1  miss rate: %s\n", buf1);
1685
1686   percentify(total[CLG_(sets).off_full_Ir+2] * 100 * p /
1687              total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
1688   VG_(message)(Vg_UserMsg, "L2i miss rate: %s\n", buf1);
1689   VG_(message)(Vg_UserMsg, "\n");
1690
1691   /* D cache results.
1692      Use the D_refs.rd and D_refs.wr values to determine the
1693    * width of columns 2 & 3. */
1694
1695   D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
1696   CLG_(init_cost)( CLG_(sets).full, D_total);
1697   CLG_(copy_cost)( CLG_(sets).Dr, D_total, total + CLG_(sets).off_full_Dr );
1698   CLG_(add_cost) ( CLG_(sets).Dw, D_total, total + CLG_(sets).off_full_Dw );
1699
1700   commify( D_total[0], l1, buf1);
1701   l2 = commify(total[CLG_(sets).off_full_Dr], 0,  buf2);
1702   l3 = commify(total[CLG_(sets).off_full_Dw], 0,  buf3);
1703   VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)\n",
1704                buf1,  buf2,  buf3);
1705
1706   commify( D_total[1], l1, buf1);
1707   commify(total[CLG_(sets).off_full_Dr+1], l2, buf2);
1708   commify(total[CLG_(sets).off_full_Dw+1], l3, buf3);
1709   VG_(message)(Vg_UserMsg, "D1  misses:    %s  (%s rd + %s wr)\n",
1710                buf1, buf2, buf3);
1711
1712   commify( D_total[2], l1, buf1);
1713   commify(total[CLG_(sets).off_full_Dr+2], l2, buf2);
1714   commify(total[CLG_(sets).off_full_Dw+2], l3, buf3);
1715   VG_(message)(Vg_UserMsg, "L2d misses:    %s  (%s rd + %s wr)\n",
1716                buf1, buf2, buf3);
1717
1718   p = 10;
1719
1720   if (0 == D_total[0])   D_total[0] = 1;
1721   if (0 == total[CLG_(sets).off_full_Dr]) total[CLG_(sets).off_full_Dr] = 1;
1722   if (0 == total[CLG_(sets).off_full_Dw]) total[CLG_(sets).off_full_Dw] = 1;
1723
1724   percentify( D_total[1] * 100 * p / D_total[0],  p, l1+1, buf1);
1725   percentify(total[CLG_(sets).off_full_Dr+1] * 100 * p /
1726              total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
1727   percentify(total[CLG_(sets).off_full_Dw+1] * 100 * p /
1728              total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
1729   VG_(message)(Vg_UserMsg, "D1  miss rate: %s (%s   + %s  )\n",
1730                buf1, buf2,buf3);
1731
1732   percentify( D_total[2] * 100 * p / D_total[0],  p, l1+1, buf1);
1733   percentify(total[CLG_(sets).off_full_Dr+2] * 100 * p /
1734              total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
1735   percentify(total[CLG_(sets).off_full_Dw+2] * 100 * p /
1736              total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
1737   VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s   + %s  )\n",
1738                buf1, buf2,buf3);
1739   VG_(message)(Vg_UserMsg, "\n");
1740
1741
1742
1743   /* L2 overall results */
1744
1745   L2_total   =
1746     total[CLG_(sets).off_full_Dr +1] +
1747     total[CLG_(sets).off_full_Dw +1] +
1748     total[CLG_(sets).off_full_Ir +1];
1749   L2_total_r =
1750     total[CLG_(sets).off_full_Dr +1] +
1751     total[CLG_(sets).off_full_Ir +1];
1752   L2_total_w = total[CLG_(sets).off_full_Dw +1];
1753   commify(L2_total,   l1, buf1);
1754   commify(L2_total_r, l2, buf2);
1755   commify(L2_total_w, l3, buf3);
1756   VG_(message)(Vg_UserMsg, "L2 refs:       %s  (%s rd + %s wr)\n",
1757                buf1, buf2, buf3);
1758
1759   L2_total_m  =
1760     total[CLG_(sets).off_full_Dr +2] +
1761     total[CLG_(sets).off_full_Dw +2] +
1762     total[CLG_(sets).off_full_Ir +2];
1763   L2_total_mr =
1764     total[CLG_(sets).off_full_Dr +2] +
1765     total[CLG_(sets).off_full_Ir +2];
1766   L2_total_mw = total[CLG_(sets).off_full_Dw +2];
1767   commify(L2_total_m,  l1, buf1);
1768   commify(L2_total_mr, l2, buf2);
1769   commify(L2_total_mw, l3, buf3);
1770   VG_(message)(Vg_UserMsg, "L2 misses:     %s  (%s rd + %s wr)\n",
1771                buf1, buf2, buf3);
1772
1773   percentify(L2_total_m  * 100 * p /
1774              (total[CLG_(sets).off_full_Ir] + D_total[0]),  p, l1+1, buf1);
1775   percentify(L2_total_mr * 100 * p /
1776              (total[CLG_(sets).off_full_Ir] + total[CLG_(sets).off_full_Dr]),
1777              p, l2+1, buf2);
1778   percentify(L2_total_mw * 100 * p /
1779              total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
1780   VG_(message)(Vg_UserMsg, "L2 miss rate:  %s (%s   + %s  )\n",
1781                buf1, buf2,buf3);
1782 }
1783
1784
1785 /*------------------------------------------------------------*/
1786 /*--- Setup for Event set.                                 ---*/
1787 /*------------------------------------------------------------*/
1788
1789 struct event_sets CLG_(sets);
1790
1791 void CLG_(init_eventsets)(Int max_user)
1792 {
1793   EventType * e1, *e2, *e3, *e4;
1794   // Basic event sets from which others are composed
1795   EventSet *Use, *Ir, *Dr, *Dw;
1796   // Compositions of basic sets used for per-instruction counters
1797   EventSet *UIr, *UIrDr, *UIrDrDw, *UIrDw, *UIrDwDr;
1798   // Composition used for global counters and aggregation
1799   EventSet *full;
1800   int sizeOfUseIr;
1801
1802   // the "Use" events types only are used with "cacheuse" simulation
1803   Use = CLG_(get_eventset)("Use", 4);
1804   if (clo_collect_cacheuse) {
1805     /* if TUse is 0, there was never a load, and no loss, too */
1806     e1 = CLG_(register_eventtype)("AcCost1");
1807     CLG_(add_eventtype)(Use, e1);
1808     e1 = CLG_(register_eventtype)("SpLoss1");
1809     CLG_(add_eventtype)(Use, e1);
1810     e1 = CLG_(register_eventtype)("AcCost2");
1811     CLG_(add_eventtype)(Use, e1);
1812     e1 = CLG_(register_eventtype)("SpLoss2");
1813     CLG_(add_eventtype)(Use, e1);
1814   }
1815
1816   Ir = CLG_(get_eventset)("Ir", 4);
1817   Dr = CLG_(get_eventset)("Dr", 4);
1818   Dw = CLG_(get_eventset)("Dw", 4);
1819   if (CLG_(clo).simulate_cache) {
1820     e1 = CLG_(register_eventtype)("Ir");
1821     e2 = CLG_(register_eventtype)("I1mr");
1822     e3 = CLG_(register_eventtype)("I2mr");
1823     if (clo_simulate_writeback) {
1824       e4 = CLG_(register_eventtype)("I2dmr");
1825       CLG_(add_dep_event4)(Ir, e1,e2,e3,e4);
1826     }
1827     else
1828       CLG_(add_dep_event3)(Ir, e1,e2,e3);
1829
1830     e1 = CLG_(register_eventtype)("Dr");
1831     e2 = CLG_(register_eventtype)("D1mr");
1832     e3 = CLG_(register_eventtype)("D2mr");
1833     if (clo_simulate_writeback) {
1834       e4 = CLG_(register_eventtype)("D2dmr");
1835       CLG_(add_dep_event4)(Dr, e1,e2,e3,e4);
1836     }
1837     else
1838       CLG_(add_dep_event3)(Dr, e1,e2,e3);
1839
1840     e1 = CLG_(register_eventtype)("Dw");
1841     e2 = CLG_(register_eventtype)("D1mw");
1842     e3 = CLG_(register_eventtype)("D2mw");
1843     if (clo_simulate_writeback) {
1844       e4 = CLG_(register_eventtype)("D2dmw");
1845       CLG_(add_dep_event4)(Dw, e1,e2,e3,e4);
1846     }
1847     else
1848       CLG_(add_dep_event3)(Dw, e1,e2,e3);
1849
1850   }
1851   else {
1852     e1 = CLG_(register_eventtype)("Ir");
1853     CLG_(add_eventtype)(Ir, e1);
1854   }
1855
1856   // Self cost event sets per guest instruction (U used only for cacheUse).
1857   // Each basic event set only appears once, as eg. multiple different Dr's
1858   // in one guest instruction are counted in the same counter.
1859
1860   sizeOfUseIr =  Use->size + Ir->size;
1861   UIr = CLG_(get_eventset)("UIr", sizeOfUseIr);
1862   CLG_(add_eventset)(UIr, Use);
1863   off_UIr_Ir  = CLG_(add_eventset)(UIr, Ir);
1864
1865   UIrDr = CLG_(get_eventset)("UIrDr", sizeOfUseIr + Dr->size);
1866   CLG_(add_eventset)(UIrDr, Use);
1867   off_UIrDr_Ir = CLG_(add_eventset)(UIrDr, Ir);
1868   off_UIrDr_Dr = CLG_(add_eventset)(UIrDr, Dr);
1869
1870   UIrDrDw  = CLG_(get_eventset)("IrDrDw", sizeOfUseIr + Dr->size + Dw->size);
1871   CLG_(add_eventset)(UIrDrDw, Use);
1872   off_UIrDrDw_Ir    = CLG_(add_eventset)(UIrDrDw, Ir);
1873   off_UIrDrDw_Dr    = CLG_(add_eventset)(UIrDrDw, Dr);
1874   off_UIrDrDw_Dw    = CLG_(add_eventset)(UIrDrDw, Dw);
1875
1876   UIrDw = CLG_(get_eventset)("UIrDw", sizeOfUseIr + Dw->size);
1877   CLG_(add_eventset)(UIrDw, Use);
1878   off_UIrDw_Ir   = CLG_(add_eventset)(UIrDw, Ir);
1879   off_UIrDw_Dw   = CLG_(add_eventset)(UIrDw, Dw);
1880
1881   UIrDwDr  = CLG_(get_eventset)("IrDwDr", sizeOfUseIr + Dw->size + Dr->size);
1882   CLG_(add_eventset)(UIrDwDr, Use);
1883   off_UIrDwDr_Ir    = CLG_(add_eventset)(UIrDrDw, Ir);
1884   off_UIrDwDr_Dw    = CLG_(add_eventset)(UIrDrDw, Dw);
1885   off_UIrDwDr_Dr    = CLG_(add_eventset)(UIrDrDw, Dr);
1886
1887
1888   // the "full" event set is used as global counter and for aggregation
1889   if (CLG_(clo).collect_alloc)   max_user += 2;
1890   if (CLG_(clo).collect_systime) max_user += 2;
1891   full = CLG_(get_eventset)("full",
1892                             sizeOfUseIr + Dr->size + Dw->size + max_user);
1893   CLG_(add_eventset)(full, Use);
1894   CLG_(sets).off_full_Ir   = CLG_(add_eventset)(full, Ir);
1895   CLG_(sets).off_full_Dr   = CLG_(add_eventset)(full, Dr);
1896   CLG_(sets).off_full_Dw   = CLG_(add_eventset)(full, Dw);
1897   if (CLG_(clo).collect_alloc) {
1898       e1 = CLG_(register_eventtype)("allocCount");
1899       e2 = CLG_(register_eventtype)("allocSize");
1900       CLG_(sets).off_full_alloc =  CLG_(add_dep_event2)(full, e1,e2);
1901   }
1902   if (CLG_(clo).collect_systime) {
1903       e1 = CLG_(register_eventtype)("sysCount");
1904       e2 = CLG_(register_eventtype)("sysTime");
1905       CLG_(sets).off_full_systime =  CLG_(add_dep_event2)(full, e1,e2);
1906   }
1907
1908   CLG_(sets).Use = Use;
1909   CLG_(sets).Ir  = Ir;
1910   CLG_(sets).Dr  = Dr;
1911   CLG_(sets).Dw  = Dw;
1912   CLG_(sets).UIr  = UIr;
1913   CLG_(sets).UIrDr = UIrDr;
1914   CLG_(sets).UIrDrDw  = UIrDrDw;
1915   CLG_(sets).UIrDw = UIrDw;
1916   CLG_(sets).UIrDwDr  = UIrDwDr;
1917   CLG_(sets).full = full;
1918
1919
1920   CLG_DEBUGIF(1) {
1921     CLG_DEBUG(1, "EventSets:\n");
1922     CLG_(print_eventset)(-2, Use);
1923     CLG_(print_eventset)(-2, Ir);
1924     CLG_(print_eventset)(-2, Dr);
1925     CLG_(print_eventset)(-2, Dw);
1926     CLG_(print_eventset)(-2, full);
1927   }
1928
1929   /* Not-existing events are silently ignored */
1930   CLG_(dumpmap) = CLG_(get_eventmapping)(full);
1931   CLG_(append_event)(CLG_(dumpmap), "Ir");
1932   CLG_(append_event)(CLG_(dumpmap), "Dr");
1933   CLG_(append_event)(CLG_(dumpmap), "Dw");
1934   CLG_(append_event)(CLG_(dumpmap), "I1mr");
1935   CLG_(append_event)(CLG_(dumpmap), "D1mr");
1936   CLG_(append_event)(CLG_(dumpmap), "D1mw");
1937   CLG_(append_event)(CLG_(dumpmap), "I2mr");
1938   CLG_(append_event)(CLG_(dumpmap), "D2mr");
1939   CLG_(append_event)(CLG_(dumpmap), "D2mw");
1940   CLG_(append_event)(CLG_(dumpmap), "I2dmr");
1941   CLG_(append_event)(CLG_(dumpmap), "D2dmr");
1942   CLG_(append_event)(CLG_(dumpmap), "D2dmw");
1943   CLG_(append_event)(CLG_(dumpmap), "AcCost1");
1944   CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
1945   CLG_(append_event)(CLG_(dumpmap), "AcCost2");
1946   CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
1947   CLG_(append_event)(CLG_(dumpmap), "allocCount");
1948   CLG_(append_event)(CLG_(dumpmap), "allocSize");
1949   CLG_(append_event)(CLG_(dumpmap), "sysCount");
1950   CLG_(append_event)(CLG_(dumpmap), "sysTime");
1951
1952 }
1953
1954
1955
1956 static
1957 void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost)
1958 {
1959   /* if eventset use is defined, it is always first (hardcoded!) */
1960   CLG_(add_and_zero_cost)( CLG_(sets).Use, dst, cost);
1961
1962   if (es == CLG_(sets).UIr) {
1963     CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir,
1964                             cost + off_UIr_Ir);
1965   }
1966   else if (es == CLG_(sets).UIrDr) {
1967     CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir,
1968                             cost + off_UIrDr_Ir);
1969     CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_full_Dr,
1970                             cost + off_UIrDr_Dr);
1971   }
1972   else if (es == CLG_(sets).UIrDrDw) {
1973     CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir,
1974                             cost + off_UIrDrDw_Ir);
1975     CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_full_Dr,
1976                             cost + off_UIrDrDw_Dr);
1977     CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_full_Dw,
1978                             cost + off_UIrDrDw_Dw);
1979   }
1980   else if (es == CLG_(sets).UIrDw) {
1981       CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir,
1982                                cost + off_UIrDw_Ir);
1983       CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_full_Dw,
1984                                cost + off_UIrDw_Dw);
1985   }
1986   else if (es == CLG_(sets).UIrDwDr) {
1987     CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir,
1988                             cost + off_UIrDwDr_Ir);
1989     CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_full_Dw,
1990                             cost + off_UIrDwDr_Dw);
1991     CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_full_Dr,
1992                              cost + off_UIrDwDr_Dr);
1993   }
1994   else CLG_ASSERT(0);
1995 }
1996
1997 /* this is called at dump time for every instruction executed */
1998 static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
1999                                InstrInfo* ii, ULong exe_count)
2000 {
2001   if (!CLG_(clo).simulate_cache)
2002       cost[CLG_(sets).off_full_Ir] += exe_count;
2003   else {
2004
2005 #if 0
2006 /* There is always a trivial case where exe_count and Ir can be
2007  * slightly different because ecounter is updated when executing
2008  * the next BB. E.g. for last BB executed, or when toggling collection
2009  */
2010       /* FIXME: Hardcoded that each eventset has Ir as first */
2011       if ((bbcc->cost + ii->cost_offset)[0] != exe_count) {
2012           VG_(printf)("==> Ir %llu, exe %llu\n",
2013                       (bbcc->cost + ii->cost_offset)[0], exe_count);
2014           CLG_(print_bbcc_cost)(-2, bbcc);
2015           //CLG_ASSERT((bbcc->cost + ii->cost_offset)[0] == exe_count);
2016       }
2017 #endif
2018
2019       add_and_zero_Dx(ii->eventset, cost,
2020                       bbcc->cost + ii->cost_offset);
2021   }
2022 }
2023
2024 static
2025 void cachesim_after_bbsetup(void)
2026 {
2027   BBCC* bbcc = CLG_(current_state).bbcc;
2028
2029   if (CLG_(clo).simulate_cache) {
2030     BB* bb = bbcc->bb;
2031
2032     /* only needed if log_* functions are called */
2033     bb_base   = bb->obj->offset + bb->offset;
2034     cost_base = bbcc->cost;
2035   }
2036 }
2037
2038 static
2039 void cachesim_finish(void)
2040 {
2041   if (clo_collect_cacheuse)
2042     cacheuse_finish();
2043 }
2044
2045 /*------------------------------------------------------------*/
2046 /*--- The simulator defined in this file                   ---*/
2047 /*------------------------------------------------------------*/
2048
2049 struct cachesim_if CLG_(cachesim) = {
2050   .print_opts    = cachesim_print_opts,
2051   .parse_opt     = cachesim_parse_opt,
2052   .post_clo_init = cachesim_post_clo_init,
2053   .clear         = cachesim_clear,
2054   .getdesc       = cachesim_getdesc,
2055   .printstat     = cachesim_printstat,
2056   .add_icost     = cachesim_add_icost,
2057   .after_bbsetup = cachesim_after_bbsetup,
2058   .finish        = cachesim_finish,
2059
2060   /* these will be set by cachesim_post_clo_init */
2061   .log_1I0D        = 0,
2062   .log_2I0D        = 0,
2063   .log_3I0D        = 0,
2064
2065   .log_1I1Dr       = 0,
2066   .log_1I1Dw       = 0,
2067
2068   .log_0I1Dr       = 0,
2069   .log_0I1Dw       = 0,
2070
2071   .log_1I0D_name = "(no function)",
2072   .log_2I0D_name = "(no function)",
2073   .log_3I0D_name = "(no function)",
2074
2075   .log_1I1Dr_name = "(no function)",
2076   .log_1I1Dw_name = "(no function)",
2077
2078   .log_0I1Dr_name = "(no function)",
2079   .log_0I1Dw_name = "(no function)",
2080 };
2081
2082
2083 /*--------------------------------------------------------------------*/
2084 /*--- end                                                 ct_sim.c ---*/
2085 /*--------------------------------------------------------------------*/
2086