l4/pkg/valgrind/src/valgrind-3.6.0-svn/callgrind/sim.c

   1 /*--------------------------------------------------------------------*/
   2 /*--- Cache simulation.                                            ---*/
   3 /*---                                                        sim.c ---*/
   4 /*--------------------------------------------------------------------*/
   5
   6 /*
   7    This file is part of Callgrind, a Valgrind tool for call graph
   8    profiling programs.
   9
  10    Copyright (C) 2003-2010, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
  11
  12    This tool is derived from and contains code from Cachegrind
  13    Copyright (C) 2002-2010 Nicholas Nethercote (njn@valgrind.org)
  14
  15    This program is free software; you can redistribute it and/or
  16    modify it under the terms of the GNU General Public License as
  17    published by the Free Software Foundation; either version 2 of the
  18    License, or (at your option) any later version.
  19
  20    This program is distributed in the hope that it will be useful, but
  21    WITHOUT ANY WARRANTY; without even the implied warranty of
  22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  23    General Public License for more details.
  24
  25    You should have received a copy of the GNU General Public License
  26    along with this program; if not, write to the Free Software
  27    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  28    02111-1307, USA.
  29
  30    The GNU General Public License is contained in the file COPYING.
  31 */
  32
  33 #include "global.h"
  34
  35
  36 /* Notes:
  37   - simulates a write-allocate cache
  38   - (block --> set) hash function uses simple bit selection
  39   - handling of references straddling two cache blocks:
  40       - counts as only one cache access (not two)
  41       - both blocks hit                  --> one hit
  42       - one block hits, the other misses --> one miss
  43       - both blocks miss                 --> one miss (not two)
  44 */
  45
  46 /* Cache configuration */
  47 #include "cg_arch.h"
  48
  49 /* additional structures for cache use info, separated
   50  * according to usage frequency:
  51  * - line_loaded : pointer to cost center of instruction
  52  *                 which loaded the line into cache.
  53  *                 Needed to increment counters when line is evicted.
  54  * - line_use    : updated on every access
  55  */
   /* Usage record kept per cache line. The CACHEUSE() access routines
    * increment 'count' and OR new bits into 'mask' whenever the line is hit. */
   56 typedef struct {
   57   UInt count;
   58   UInt mask; /* e.g. for 64Byte line size 1bit/2Byte */
   59 } line_use;
  60
   /* Bookkeeping recorded for a line held in the cache. The debug output of
    * the CACHEUSE() routines prints 'memline' as the line and 'iaddr' as the
    * address it was loaded "from".
    * NOTE(review): 'use_base' is presumably the cost array of the loading
    * instruction -- confirm against update_*_use(), which is not visible
    * in this part of the file. */
   61 typedef struct {
   62   Addr memline, iaddr;
   63   line_use* dep_use; /* point to higher-level cacheblock for this memline */
   64   ULong* use_base;
   65 } line_loaded;
  66
  67 /* Cache state */
   /* State and derived geometry of one simulated cache. size, assoc and
    * line_size are copied from the configuration by cachesim_initcache(),
    * which also derives the remaining geometry fields. */
   68 typedef struct {
   69    char*        name;
   70    int          size;                   /* bytes */
   71    int          assoc;
   72    int          line_size;              /* bytes */
   73    Bool         sectored;  /* prefetch nearside cacheline on read */
   74    int          sets;                   /* (size / line_size) / assoc */
   75    int          sets_min_1;             /* sets - 1; used as mask for the set index */
   76    int          line_size_bits;         /* log2(line_size) */
   77    int          tag_shift;              /* line_size_bits + log2(sets) */
   78    UWord        tag_mask;               /* ~((1 << tag_shift) - 1) */
   79    char         desc_line[128];         /* textual description of the configuration */
   80    UWord*       tags;                   /* sets * assoc entries, one set after the other */
   81
   82   /* for cache use */
   83    int          line_size_mask;         /* line_size - 1 */
   84    int*         line_start_mask;        /* line_size entries, see cacheuse_initcache() */
   85    int*         line_end_mask;          /* line_size entries, see cacheuse_initcache() */
   86    line_loaded* loaded;                 /* sets * assoc entries */
   87    line_use*    use;                    /* sets * assoc entries; 0 if cache use is not collected */
   88 } cache_t2;
  89
  90 /*
  91  * States of flat caches in our model.
   92  * We use a 2-level hierarchy.
  93  */
   /* I1 and D1 are the first-level caches looked up first by the instruction
    * and data access routines; LL is looked up when the first level misses. */
   94 static cache_t2 I1, D1, LL;
  95
  96 /* Lower bits of cache tags are used as flags for a cache line */
   /* cachesim_initcache() asserts that tag_mask has no bit in common with
    * CACHELINE_FLAGMASK. CACHELINE_DIRTY (bit 0 of a tag entry) marks a
    * written line in the write-back lookup, see cachesim_setref_wb(). */
   97 #define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
   98 #define CACHELINE_DIRTY    1
  99
 100
 101 /* Cache simulator Options */
  /* clo_collect_cacheuse makes cachesim_initcache() allocate the cache use
   * arrays.
   * NOTE(review): all four flags are presumably set by option parsing
   * outside the part of the file visible here -- confirm. */
  102 static Bool clo_simulate_writeback = False;
  103 static Bool clo_simulate_hwpref = False;
  104 static Bool clo_simulate_sectors = False;
  105 static Bool clo_collect_cacheuse = False;
 106
  107 /* The following global vars are set up beforehand by setup_bbcc():
 108  *
 109  * - Addr   CLG_(bb_base)     (instruction start address of original BB)
 110  * - ULong* CLG_(cost_base)   (start of cost array for BB)
 111  */
 112
  /* Definitions of the two non-static globals; they are visible to other
   * compilation units. */
  113 Addr   CLG_(bb_base);
  114 ULong* CLG_(cost_base);
  115
  /* NOTE(review): not referenced in the part of the file visible here;
   * presumably the InstrInfo of the instruction currently simulated --
   * confirm against the callers. */
  116 static InstrInfo* current_ii;
 117
 118 /* Cache use offsets */
 119 /* The offsets are only correct because all per-instruction event sets get
 120  * the "Use" set added first !
 121  */
  /* I1 and D1 share the slots 0 and 1; LL uses the slots 2 and 3.
   * NOTE(review): "AcCost"/"SpLoss" presumably stand for access cost and
   * spatial loss -- confirm against the event set definitions. */
  122 static Int off_I1_AcCost  = 0;
  123 static Int off_I1_SpLoss  = 1;
  124 static Int off_D1_AcCost  = 0;
  125 static Int off_D1_SpLoss  = 1;
  126 static Int off_LL_AcCost  = 2;
  127 static Int off_LL_SpLoss  = 3;
 128
 129 /* Cache access types */
  /* Write has the value of CACHELINE_DIRTY, so OR'ing a RefType into a tag
   * entry sets the dirty bit for a write and changes nothing for a read. */
  130 typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
 131
 132 /* Result of a reference into a flat cache */
  /* MissDirty is only produced by the write-back lookup: a miss where the
   * entry dropped from the set had its dirty bit set. */
  133 typedef enum { Hit  = 0, Miss, MissDirty } CacheResult;
 134
 135 /* Result of a reference into a hierarchical cache model */
  /* L1_Hit: hit in I1/D1.  LL_Hit: first level missed, LL hit.
   * MemAccess: LL missed.  WriteBackMemAccess: LL missed and the lookup
   * reported MissDirty. */
  136 typedef enum {
  137     L1_Hit,
  138     LL_Hit,
  139     MemAccess,
  140     WriteBackMemAccess } CacheModelResult;
 141
  /* Signature shared by the simulator entry points: start address of the
   * access and its size in bytes. */
  142 typedef CacheModelResult (*simcall_type)(Addr, UChar);
 143
  /* Entry points of the cache model in use, one per access kind.
   * NOTE(review): the table is not assigned in the part of the file visible
   * here; presumably it is filled in during initialization -- confirm. */
  144 static struct {
  145     simcall_type I1_Read;
  146     simcall_type D1_Read;
  147     simcall_type D1_Write;
  148 } simulator;
 149
 150 /*------------------------------------------------------------*/
 151 /*--- Cache Simulator Initialization                       ---*/
 152 /*------------------------------------------------------------*/
 153
 154 static void cachesim_clearcache(cache_t2* c)
 155 {
 156   Int i;
 157
 158   for (i = 0; i < c->sets * c->assoc; i++)
 159     c->tags[i] = 0;
 160   if (c->use) {
 161     for (i = 0; i < c->sets * c->assoc; i++) {
 162       c->loaded[i].memline  = 0;
 163       c->loaded[i].use_base = 0;
 164       c->loaded[i].dep_use = 0;
 165       c->loaded[i].iaddr = 0;
 166       c->use[i].mask    = 0;
 167       c->use[i].count   = 0;
 168       c->tags[i] = i % c->assoc; /* init lower bits as pointer */
 169     }
 170   }
 171 }
 172
  /* Forward declaration: the function is defined further down in this file
   * but already called by cachesim_initcache(). */
  173 static void cacheuse_initcache(cache_t2* c);
 174
 175 /* By this point, the size/assoc/line_size has been checked. */
 176 static void cachesim_initcache(cache_t config, cache_t2* c)
 177 {
 178    c->size      = config.size;
 179    c->assoc     = config.assoc;
 180    c->line_size = config.line_size;
 181    c->sectored  = False; // FIXME
 182
 183    c->sets           = (c->size / c->line_size) / c->assoc;
 184    c->sets_min_1     = c->sets - 1;
 185    c->line_size_bits = VG_(log2)(c->line_size);
 186    c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
 187    c->tag_mask       = ~((1<<c->tag_shift)-1);
 188
 189    /* Can bits in tag entries be used for flags?
 190     * Should be always true as MIN_LINE_SIZE >= 16 */
 191    CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);
 192
 193    if (c->assoc == 1) {
 194       VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
 195                    c->size, c->line_size,
 196                    c->sectored ? ", sectored":"");
 197    } else {
 198       VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
 199                    c->size, c->line_size, c->assoc,
 200                    c->sectored ? ", sectored":"");
 201    }
 202
 203    c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
 204                                  sizeof(UWord) * c->sets * c->assoc);
 205    if (clo_collect_cacheuse)
 206        cacheuse_initcache(c);
 207    else
 208      c->use = 0;
 209    cachesim_clearcache(c);
 210 }
 211
 212
  213 #if 0
  /* Debugging helper, compiled out by the surrounding #if 0: prints all tag
   * entries of cache c, one set per output line. */
  214 static void print_cache(cache_t2* c)
  215 {
  216    UInt set, way, i;
  217
  218    /* Note initialisation and update of 'i'. */
  219    for (i = 0, set = 0; set < c->sets; set++) {
  220       for (way = 0; way < c->assoc; way++, i++) {
  221          VG_(printf)("%8x ", c->tags[i]);
  222       }
  223       VG_(printf)("\n");
  224    }
  225 }
  226 #endif
 227
 228
 229 /*------------------------------------------------------------*/
 230 /*--- Write Through Cache Simulation                       ---*/
 231 /*------------------------------------------------------------*/
 232
 233 /*
 234  * Simple model: L1 & LL Write Through
 235  * Does not distinguish among read and write references
 236  *
 237  * Simulator functions:
 238  *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 239  *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 240  */
 241
 242 static __inline__
 243 CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
 244 {
 245     int i, j;
 246     UWord *set;
 247
 248     set = &(c->tags[set_no * c->assoc]);
 249
 250     /* This loop is unrolled for just the first case, which is the most */
 251     /* common.  We can't unroll any further because it would screw up   */
 252     /* if we have a direct-mapped (1-way) cache.                        */
 253     if (tag == set[0])
 254         return Hit;
 255
 256     /* If the tag is one other than the MRU, move it into the MRU spot  */
 257     /* and shuffle the rest down.                                       */
 258     for (i = 1; i < c->assoc; i++) {
 259         if (tag == set[i]) {
 260             for (j = i; j > 0; j--) {
 261                 set[j] = set[j - 1];
 262             }
 263             set[0] = tag;
 264             return Hit;
 265         }
 266     }
 267
 268     /* A miss;  install this tag as MRU, shuffle rest down. */
 269     for (j = c->assoc - 1; j > 0; j--) {
 270         set[j] = set[j - 1];
 271     }
 272     set[0] = tag;
 273
 274     return Miss;
 275 }
 276
 277 static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
 278 {
 279     UInt  set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
 280     UInt  set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
 281     UWord tag  = a >> c->tag_shift;
 282
 283     /* Access entirely within line. */
 284     if (set1 == set2)
 285         return cachesim_setref(c, set1, tag);
 286
 287     /* Access straddles two lines. */
 288     /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
 289     else if (((set1 + 1) & (c->sets-1)) == set2) {
 290         UWord tag2  = (a+size-1) >> c->tag_shift;
 291
 292         /* the call updates cache structures as side effect */
 293         CacheResult res1 =  cachesim_setref(c, set1, tag);
 294         CacheResult res2 =  cachesim_setref(c, set2, tag2);
 295         return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
 296
 297    } else {
 298        VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
 299        VG_(tool_panic)("item straddles more than two cache sets");
 300    }
 301    return Hit;
 302 }
 303
 304 static
 305 CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 306 {
 307     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
 308     if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
 309     return MemAccess;
 310 }
 311
 312 static
 313 CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 314 {
 315     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
 316     if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
 317     return MemAccess;
 318 }
 319
 320
 321 /*------------------------------------------------------------*/
 322 /*--- Write Back Cache Simulation                          ---*/
 323 /*------------------------------------------------------------*/
 324
 325 /*
 326  * More complex model: L1 Write-through, LL Write-back
 327  * This needs to distinguish among read and write references.
 328  *
 329  * Simulator functions:
 330  *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 331  *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 332  *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 333  */
 334
 335 /*
 336  * With write-back, result can be a miss evicting a dirty line
 337  * The dirty state of a cache line is stored in Bit0 of the tag for
 338  * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
 339  * type (Read/Write), the line gets dirty on a write.
 340  */
 341 static __inline__
 342 CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
 343 {
 344     int i, j;
 345     UWord *set, tmp_tag;
 346
 347     set = &(c->tags[set_no * c->assoc]);
 348
 349     /* This loop is unrolled for just the first case, which is the most */
 350     /* common.  We can't unroll any further because it would screw up   */
 351     /* if we have a direct-mapped (1-way) cache.                        */
 352     if (tag == (set[0] & ~CACHELINE_DIRTY)) {
 353         set[0] |= ref;
 354         return Hit;
 355     }
 356     /* If the tag is one other than the MRU, move it into the MRU spot  */
 357     /* and shuffle the rest down.                                       */
 358     for (i = 1; i < c->assoc; i++) {
 359         if (tag == (set[i] & ~CACHELINE_DIRTY)) {
 360             tmp_tag = set[i] | ref; // update dirty flag
 361             for (j = i; j > 0; j--) {
 362                 set[j] = set[j - 1];
 363             }
 364             set[0] = tmp_tag;
 365             return Hit;
 366         }
 367     }
 368
 369     /* A miss;  install this tag as MRU, shuffle rest down. */
 370     tmp_tag = set[c->assoc - 1];
 371     for (j = c->assoc - 1; j > 0; j--) {
 372         set[j] = set[j - 1];
 373     }
 374     set[0] = tag | ref;
 375
 376     return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
 377 }
 378
 379
 380 static __inline__
 381 CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
 382 {
 383     UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
 384     UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
 385     UWord tag = a & c->tag_mask;
 386
 387     /* Access entirely within line. */
 388     if (set1 == set2)
 389         return cachesim_setref_wb(c, ref, set1, tag);
 390
 391     /* Access straddles two lines. */
 392     /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
 393     else if (((set1 + 1) & (c->sets-1)) == set2) {
 394         UWord tag2  = (a+size-1) & c->tag_mask;
 395
 396         /* the call updates cache structures as side effect */
 397         CacheResult res1 =  cachesim_setref_wb(c, ref, set1, tag);
 398         CacheResult res2 =  cachesim_setref_wb(c, ref, set2, tag2);
 399
 400         if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
 401         return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
 402
 403    } else {
 404        VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
 405        VG_(tool_panic)("item straddles more than two cache sets");
 406    }
 407    return Hit;
 408 }
 409
 410
 411 static
 412 CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 413 {
 414     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
 415     switch( cachesim_ref_wb( &LL, Read, a, size) ) {
 416         case Hit: return LL_Hit;
 417         case Miss: return MemAccess;
 418         default: break;
 419     }
 420     return WriteBackMemAccess;
 421 }
 422
 423 static
 424 CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 425 {
 426     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
 427     switch( cachesim_ref_wb( &LL, Read, a, size) ) {
 428         case Hit: return LL_Hit;
 429         case Miss: return MemAccess;
 430         default: break;
 431     }
 432     return WriteBackMemAccess;
 433 }
 434
 435 static
 436 CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 437 {
 438     if ( cachesim_ref( &D1, a, size) == Hit ) {
 439         /* Even for a L1 hit, the write-trough L1 passes
 440          * the write to the LL to make the LL line dirty.
 441          * But this causes no latency, so return the hit.
 442          */
 443         cachesim_ref_wb( &LL, Write, a, size);
 444         return L1_Hit;
 445     }
 446     switch( cachesim_ref_wb( &LL, Write, a, size) ) {
 447         case Hit: return LL_Hit;
 448         case Miss: return MemAccess;
 449         default: break;
 450     }
 451     return WriteBackMemAccess;
 452 }
 453
 454
 455 /*------------------------------------------------------------*/
 456 /*--- Hardware Prefetch Simulation                         ---*/
 457 /*------------------------------------------------------------*/
 458
  /* Number of prefetches issued for ascending / descending streams;
   * incremented in prefetch_LL_doref(). */
  459 static ULong prefetch_up = 0;
  460 static ULong prefetch_down = 0;
  461
  /* PF_STREAMS: number of stream slots. PF_PAGEBITS: an address is mapped to
   * the slot (a >> PF_PAGEBITS) % PF_STREAMS, i.e. by its 4k page number. */
  462 #define PF_STREAMS  8
  463 #define PF_PAGEBITS 12
  464
  /* Per stream slot: the last LL block number seen, and the signed length of
   * the current run of sequential blocks (> 0 ascending, < 0 descending,
   * 0 no run). */
  465 static UInt pf_lastblock[PF_STREAMS];
  466 static Int  pf_seqblocks[PF_STREAMS];
 467
 468 static
 469 void prefetch_clear(void)
 470 {
 471   int i;
 472   for(i=0;i<PF_STREAMS;i++)
 473     pf_lastblock[i] = pf_seqblocks[i] = 0;
 474 }
 475
 476 /*
 477  * HW Prefetch emulation
 478  * Start prefetching when detecting sequential access to 3 memory blocks.
 479  * One stream can be detected per 4k page.
 480  */
 481 static __inline__
 482 void prefetch_LL_doref(Addr a)
 483 {
 484   UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
 485   UInt block = ( a >> LL.line_size_bits);
 486
 487   if (block != pf_lastblock[stream]) {
 488     if (pf_seqblocks[stream] == 0) {
 489       if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
 490       else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
 491     }
 492     else if (pf_seqblocks[stream] >0) {
 493       if (pf_lastblock[stream] +1 == block) {
 494         pf_seqblocks[stream]++;
 495         if (pf_seqblocks[stream] >= 2) {
 496           prefetch_up++;
 497           cachesim_ref(&LL, a + 5 * LL.line_size,1);
 498         }
 499       }
 500       else pf_seqblocks[stream] = 0;
 501     }
 502     else if (pf_seqblocks[stream] <0) {
 503       if (pf_lastblock[stream] -1 == block) {
 504         pf_seqblocks[stream]--;
 505         if (pf_seqblocks[stream] <= -2) {
 506           prefetch_down++;
 507           cachesim_ref(&LL, a - 5 * LL.line_size,1);
 508         }
 509       }
 510       else pf_seqblocks[stream] = 0;
 511     }
 512     pf_lastblock[stream] = block;
 513   }
 514 }
 515
 516 /* simple model with hardware prefetch */
 517
 518 static
 519 CacheModelResult prefetch_I1_ref(Addr a, UChar size)
 520 {
 521     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
 522     prefetch_LL_doref(a);
 523     if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
 524     return MemAccess;
 525 }
 526
 527 static
 528 CacheModelResult prefetch_D1_ref(Addr a, UChar size)
 529 {
 530     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
 531     prefetch_LL_doref(a);
 532     if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
 533     return MemAccess;
 534 }
 535
 536
 537 /* complex model with hardware prefetch */
 538
 539 static
 540 CacheModelResult prefetch_I1_Read(Addr a, UChar size)
 541 {
 542     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
 543     prefetch_LL_doref(a);
 544     switch( cachesim_ref_wb( &LL, Read, a, size) ) {
 545         case Hit: return LL_Hit;
 546         case Miss: return MemAccess;
 547         default: break;
 548     }
 549     return WriteBackMemAccess;
 550 }
 551
 552 static
 553 CacheModelResult prefetch_D1_Read(Addr a, UChar size)
 554 {
 555     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
 556     prefetch_LL_doref(a);
 557     switch( cachesim_ref_wb( &LL, Read, a, size) ) {
 558         case Hit: return LL_Hit;
 559         case Miss: return MemAccess;
 560         default: break;
 561     }
 562     return WriteBackMemAccess;
 563 }
 564
 565 static
 566 CacheModelResult prefetch_D1_Write(Addr a, UChar size)
 567 {
 568     prefetch_LL_doref(a);
 569     if ( cachesim_ref( &D1, a, size) == Hit ) {
 570         /* Even for a L1 hit, the write-trough L1 passes
 571          * the write to the LL to make the LL line dirty.
 572          * But this causes no latency, so return the hit.
 573          */
 574         cachesim_ref_wb( &LL, Write, a, size);
 575         return L1_Hit;
 576     }
 577     switch( cachesim_ref_wb( &LL, Write, a, size) ) {
 578         case Hit: return LL_Hit;
 579         case Miss: return MemAccess;
 580         default: break;
 581     }
 582     return WriteBackMemAccess;
 583 }
 584
 585
 586 /*------------------------------------------------------------*/
 587 /*--- Cache Simulation with use metric collection          ---*/
 588 /*------------------------------------------------------------*/
 589
 590 /* can not be combined with write-back or prefetch */
 591
 592 static
 593 void cacheuse_initcache(cache_t2* c)
 594 {
 595     int i;
 596     unsigned int start_mask, start_val;
 597     unsigned int end_mask, end_val;
 598
 599     c->use    = CLG_MALLOC("cl.sim.cu_ic.1",
 600                            sizeof(line_use) * c->sets * c->assoc);
 601     c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
 602                            sizeof(line_loaded) * c->sets * c->assoc);
 603     c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
 604                                     sizeof(int) * c->line_size);
 605     c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
 606                                   sizeof(int) * c->line_size);
 607
 608     c->line_size_mask = c->line_size-1;
 609
 610     /* Meaning of line_start_mask/line_end_mask
 611      * Example: for a given cache line, you get an access starting at
 612      * byte offset 5, length 4, byte 5 - 8 was touched. For a cache
 613      * line size of 32, you have 1 bit per byte in the mask:
 614      *
 615      *   bit31   bit8 bit5  bit 0
 616      *       |      |  |    |
 617      *       11..111111100000   line_start_mask[5]
 618      *       00..000111111111   line_end_mask[(5+4)-1]
 619      *
 620      *  use_mask |= line_start_mask[5] && line_end_mask[8]
 621      *
 622      */
 623     start_val = end_val = ~0;
 624     if (c->line_size < 32) {
 625         int bits_per_byte = 32/c->line_size;
 626         start_mask = (1<<bits_per_byte)-1;
 627         end_mask   = start_mask << (32-bits_per_byte);
 628         for(i=0;i<c->line_size;i++) {
 629             c->line_start_mask[i] = start_val;
 630             start_val  = start_val & ~start_mask;
 631             start_mask = start_mask << bits_per_byte;
 632
 633             c->line_end_mask[c->line_size-i-1] = end_val;
 634             end_val  = end_val & ~end_mask;
 635             end_mask = end_mask >> bits_per_byte;
 636         }
 637     }
 638     else {
 639         int bytes_per_bit = c->line_size/32;
 640         start_mask = 1;
 641         end_mask   = 1 << 31;
 642         for(i=0;i<c->line_size;i++) {
 643             c->line_start_mask[i] = start_val;
 644             c->line_end_mask[c->line_size-i-1] = end_val;
 645             if ( ((i+1)%bytes_per_bit) == 0) {
 646                 start_val   &= ~start_mask;
 647                 end_val     &= ~end_mask;
 648                 start_mask <<= 1;
 649                 end_mask   >>= 1;
 650             }
 651         }
 652     }
 653
 654     CLG_DEBUG(6, "Config %s:\n", c->desc_line);
 655     for(i=0;i<c->line_size;i++) {
 656         CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
 657                   i, c->line_start_mask[i], c->line_end_mask[i]);
 658     }
 659
 660     /* We use lower tag bits as offset pointers to cache use info.
 661      * I.e. some cache parameters don't work.
 662      */
 663     if ( (1<<c->tag_shift) < c->assoc) {
 664         VG_(message)(Vg_DebugMsg,
 665                      "error: Use associativity < %d for cache use statistics!\n",
 666                      (1<<c->tag_shift) );
 667         VG_(tool_panic)("Unsupported cache configuration");
 668     }
 669 }
 670
 671
 672 /* for I1/D1 caches */
 673 #define CACHEUSE(L)                                                         \
 674                                                                             \
 675 static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size)         \
 676 {                                                                           \
 677    UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);           \
 678    UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);           \
 679    UWord tag  = a & L.tag_mask;                                             \
 680    UWord tag2;                                                              \
 681    int i, j, idx;                                                           \
 682    UWord *set, tmp_tag;                                                     \
 683    UInt use_mask;                                                           \
 684                                                                             \
 685    CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n",                  \
 686             L.name, a, size, set1, set2);                                   \
 687                                                                             \
 688    /* First case: word entirely within line. */                             \
 689    if (set1 == set2) {                                                      \
 690                                                                             \
 691       set = &(L.tags[set1 * L.assoc]);                                      \
 692       use_mask = L.line_start_mask[a & L.line_size_mask] &                  \
 693                  L.line_end_mask[(a+size-1) & L.line_size_mask];            \
 694                                                                             \
 695       /* This loop is unrolled for just the first case, which is the most */\
 696       /* common.  We can't unroll any further because it would screw up   */\
 697       /* if we have a direct-mapped (1-way) cache.                        */\
 698       if (tag == (set[0] & L.tag_mask)) {                                   \
 699         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                    \
 700         L.use[idx].count ++;                                                \
 701         L.use[idx].mask |= use_mask;                                        \
 702         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 703                  idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
 704                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 705         return L1_Hit;                                                      \
 706       }                                                                     \
 707       /* If the tag is one other than the MRU, move it into the MRU spot  */\
 708       /* and shuffle the rest down.                                       */\
 709       for (i = 1; i < L.assoc; i++) {                                       \
 710          if (tag == (set[i] & L.tag_mask)) {                                \
 711             tmp_tag = set[i];                                               \
 712             for (j = i; j > 0; j--) {                                       \
 713                set[j] = set[j - 1];                                         \
 714             }                                                               \
 715             set[0] = tmp_tag;                                               \
 716             idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
 717             L.use[idx].count ++;                                            \
 718             L.use[idx].mask |= use_mask;                                    \
 719         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 720                  i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
 721                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 722             return L1_Hit;                                                  \
 723          }                                                                  \
 724       }                                                                     \
 725                                                                             \
 726       /* A miss;  install this tag as MRU, shuffle rest down. */            \
 727       tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
 728       for (j = L.assoc - 1; j > 0; j--) {                                   \
 729          set[j] = set[j - 1];                                               \
 730       }                                                                     \
 731       set[0] = tag | tmp_tag;                                               \
 732       idx = (set1 * L.assoc) + tmp_tag;                                     \
 733       return update_##L##_use(&L, idx,                                      \
 734                        use_mask, a &~ L.line_size_mask);                    \
 735                                                                             \
 736    /* Second case: word straddles two lines. */                             \
 737    /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
 738    } else if (((set1 + 1) & (L.sets-1)) == set2) {                          \
 739       Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */           \
 740       set = &(L.tags[set1 * L.assoc]);                                      \
 741       use_mask = L.line_start_mask[a & L.line_size_mask];                   \
 742       if (tag == (set[0] & L.tag_mask)) {                                   \
 743          idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
 744          L.use[idx].count ++;                                               \
 745          L.use[idx].mask |= use_mask;                                       \
 746         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 747                  idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
 748                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 749          goto block2;                                                       \
 750       }                                                                     \
 751       for (i = 1; i < L.assoc; i++) {                                       \
 752          if (tag == (set[i] & L.tag_mask)) {                                \
 753             tmp_tag = set[i];                                               \
 754             for (j = i; j > 0; j--) {                                       \
 755                set[j] = set[j - 1];                                         \
 756             }                                                               \
 757             set[0] = tmp_tag;                                               \
 758             idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
 759             L.use[idx].count ++;                                            \
 760             L.use[idx].mask |= use_mask;                                    \
 761         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 762                  i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
 763                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 764             goto block2;                                                    \
 765          }                                                                  \
 766       }                                                                     \
 767       tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
 768       for (j = L.assoc - 1; j > 0; j--) {                                   \
 769          set[j] = set[j - 1];                                               \
 770       }                                                                     \
 771       set[0] = tag | tmp_tag;                                               \
 772       idx = (set1 * L.assoc) + tmp_tag;                                     \
 773       miss1 = update_##L##_use(&L, idx,                                     \
 774                        use_mask, a &~ L.line_size_mask);                    \
 775 block2:                                                                     \
 776       set = &(L.tags[set2 * L.assoc]);                                      \
 777       use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask];            \
 778       tag2  = (a+size-1) & L.tag_mask;                                      \
 779       if (tag2 == (set[0] & L.tag_mask)) {                                  \
 780          idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask);                   \
 781          L.use[idx].count ++;                                               \
 782          L.use[idx].mask |= use_mask;                                       \
 783         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 784                  idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
 785                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 786          return miss1;                                                      \
 787       }                                                                     \
 788       for (i = 1; i < L.assoc; i++) {                                       \
 789          if (tag2 == (set[i] & L.tag_mask)) {                               \
 790             tmp_tag = set[i];                                               \
 791             for (j = i; j > 0; j--) {                                       \
 792                set[j] = set[j - 1];                                         \
 793             }                                                               \
 794             set[0] = tmp_tag;                                               \
 795             idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
 796             L.use[idx].count ++;                                            \
 797             L.use[idx].mask |= use_mask;                                    \
 798         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 799                  i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
 800                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 801             return miss1;                                                   \
 802          }                                                                  \
 803       }                                                                     \
 804       tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
 805       for (j = L.assoc - 1; j > 0; j--) {                                   \
 806          set[j] = set[j - 1];                                               \
 807       }                                                                     \
 808       set[0] = tag2 | tmp_tag;                                              \
 809       idx = (set2 * L.assoc) + tmp_tag;                                     \
 810       miss2 = update_##L##_use(&L, idx,                                     \
 811                        use_mask, (a+size-1) &~ L.line_size_mask);           \
 812       return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit;     \
 813                                                                             \
 814    } else {                                                                 \
 815        VG_(printf)("addr: %#lx  size: %u  sets: %d %d", a, size, set1, set2); \
 816        VG_(tool_panic)("item straddles more than two cache sets");          \
 817    }                                                                        \
 818    return 0;                                                                \
 819 }
 820
 821
 822 /* logarithmic bitcounting algorithm, see
 823  * http://graphics.stanford.edu/~seander/bithacks.html
 824  */
 825 static __inline__ unsigned int countBits(unsigned int bits)
 826 {
 827   unsigned int c; // store the total here
 828   const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
 829   const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};
 830
 831   c = bits;
 832   c = ((c >> S[0]) & B[0]) + (c & B[0]);
 833   c = ((c >> S[1]) & B[1]) + (c & B[1]);
 834   c = ((c >> S[2]) & B[2]) + (c & B[2]);
 835   c = ((c >> S[3]) & B[3]) + (c & B[3]);
 836   c = ((c >> S[4]) & B[4]) + (c & B[4]);
 837   return c;
 838 }
 839
 840 static void update_LL_use(int idx, Addr memline)
 841 {
 842   line_loaded* loaded = &(LL.loaded[idx]);
 843   line_use* use = &(LL.use[idx]);
 844   int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;
 845
 846   CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
 847            idx, CLG_(bb_base) + current_ii->instr_offset, memline);
 848   if (use->count>0) {
 849     CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
 850              use->count, i, use->mask, loaded->memline, loaded->iaddr);
 851     CLG_DEBUG(2, "   collect: %d, use_base %p\n",
 852              CLG_(current_state).collect, loaded->use_base);
 853
 854     if (CLG_(current_state).collect && loaded->use_base) {
 855       (loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
 856       (loaded->use_base)[off_LL_SpLoss] += i;
 857     }
 858    }
 859
 860    use->count = 0;
 861    use->mask  = 0;
 862
 863   loaded->memline = memline;
 864   loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;
 865   loaded->use_base = (CLG_(current_state).nonskipped) ?
 866     CLG_(current_state).nonskipped->skipped :
 867     CLG_(cost_base) + current_ii->cost_offset;
 868 }
 869
 870 static
 871 CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
 872 {
 873    UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
 874    UWord* set = &(LL.tags[setNo * LL.assoc]);
 875    UWord tag  = memline & LL.tag_mask;
 876
 877    int i, j, idx;
 878    UWord tmp_tag;
 879
 880    CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %d\n", memline, setNo);
 881
 882    if (tag == (set[0] & LL.tag_mask)) {
 883      idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
 884      l1_loaded->dep_use = &(LL.use[idx]);
 885
 886      CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
 887                  idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
 888                  LL.use[idx].mask, LL.use[idx].count);
 889      return LL_Hit;
 890    }
 891    for (i = 1; i < LL.assoc; i++) {
 892      if (tag == (set[i] & LL.tag_mask)) {
 893        tmp_tag = set[i];
 894        for (j = i; j > 0; j--) {
 895          set[j] = set[j - 1];
 896        }
 897        set[0] = tmp_tag;
 898        idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
 899        l1_loaded->dep_use = &(LL.use[idx]);
 900
 901         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
 902                  i, idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
 903                  LL.use[idx].mask, LL.use[idx].count);
 904         return LL_Hit;
 905      }
 906    }
 907
 908    /* A miss;  install this tag as MRU, shuffle rest down. */
 909    tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
 910    for (j = LL.assoc - 1; j > 0; j--) {
 911      set[j] = set[j - 1];
 912    }
 913    set[0] = tag | tmp_tag;
 914    idx = (setNo * LL.assoc) + tmp_tag;
 915    l1_loaded->dep_use = &(LL.use[idx]);
 916
 917    update_LL_use(idx, memline);
 918
 919    return MemAccess;
 920 }
 921
 922
 923
 924
 925 #define UPDATE_USE(L)                                                \
 926                                                                      \
 927 static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
 928                                UInt mask, Addr memline)              \
 929 {                                                                    \
 930   line_loaded* loaded = &(cache->loaded[idx]);                       \
 931   line_use* use = &(cache->use[idx]);                                \
 932   int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;       \
 933                                                                      \
 934   CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
 935            cache->name, idx, CLG_(bb_base) + current_ii->instr_offset, memline, mask); \
 936   if (use->count>0) {                                                \
 937     CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",\
 938              use->count, c, use->mask, loaded->memline, loaded->iaddr); \
 939     CLG_DEBUG(2, "   collect: %d, use_base %p\n", \
 940              CLG_(current_state).collect, loaded->use_base);         \
 941                                                                      \
 942     if (CLG_(current_state).collect && loaded->use_base) {           \
 943       (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;     \
 944       (loaded->use_base)[off_##L##_SpLoss] += c;                     \
 945                                                                      \
 946       /* FIXME (?): L1/LL line sizes must be equal ! */              \
 947       loaded->dep_use->mask |= use->mask;                            \
 948       loaded->dep_use->count += use->count;                          \
 949     }                                                                \
 950   }                                                                  \
 951                                                                      \
 952   use->count = 1;                                                    \
 953   use->mask  = mask;                                                 \
 954   loaded->memline = memline;                                         \
 955   loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;        \
 956   loaded->use_base = (CLG_(current_state).nonskipped) ?              \
 957     CLG_(current_state).nonskipped->skipped :                        \
 958     CLG_(cost_base) + current_ii->cost_offset;                       \
 959                                                                      \
 960   if (memline == 0) return LL_Hit;                                   \
 961   return cacheuse_LL_access(memline, loaded);                        \
 962 }
 963
/* Instantiate update_I1_use() / update_D1_use().  These have to come
 * before the CACHEUSE instantiations, whose generated code calls them. */
UPDATE_USE(I1);
UPDATE_USE(D1);

/* Instantiate the cache use simulation for I1 and D1. */
CACHEUSE(I1);
CACHEUSE(D1);
 969
 970
 971 static
 972 void cacheuse_finish(void)
 973 {
 974   int i;
 975   InstrInfo ii = { 0,0,0,0 };
 976
 977   if (!CLG_(current_state).collect) return;
 978
 979   CLG_(bb_base) = 0;
 980   current_ii = &ii; /* needs to be set for update_XX_use */
 981   CLG_(cost_base) = 0;
 982
 983   /* update usage counters */
 984   if (I1.use)
 985     for (i = 0; i < I1.sets * I1.assoc; i++)
 986       if (I1.loaded[i].use_base)
 987         update_I1_use( &I1, i, 0,0);
 988
 989   if (D1.use)
 990     for (i = 0; i < D1.sets * D1.assoc; i++)
 991       if (D1.loaded[i].use_base)
 992         update_D1_use( &D1, i, 0,0);
 993
 994   if (LL.use)
 995     for (i = 0; i < LL.sets * LL.assoc; i++)
 996       if (LL.loaded[i].use_base)
 997         update_LL_use(i, 0);
 998
 999   current_ii = 0;
1000 }
1001
1002
1003
1004 /*------------------------------------------------------------*/
1005 /*--- Helper functions called by instrumented code         ---*/
1006 /*------------------------------------------------------------*/
1007
1008
1009 static __inline__
1010 void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
1011 {
1012     switch(r) {
1013         case WriteBackMemAccess:
1014             if (clo_simulate_writeback) {
1015                 c1[3]++;
1016                 c2[3]++;
1017             }
1018             // fall through
1019
1020         case MemAccess:
1021             c1[2]++;
1022             c2[2]++;
1023             // fall through
1024
1025         case LL_Hit:
1026             c1[1]++;
1027             c2[1]++;
1028             // fall through
1029
1030         default:
1031             c1[0]++;
1032             c2[0]++;
1033     }
1034 }
1035
1036 static
1037 Char* cacheRes(CacheModelResult r)
1038 {
1039     switch(r) {
1040     case L1_Hit:    return "L1 Hit ";
1041     case LL_Hit:    return "LL Hit ";
1042     case MemAccess: return "LL Miss";
1043     case WriteBackMemAccess: return "LL Miss (dirty)";
1044     default:
1045         tl_assert(0);
1046     }
1047     return "??";
1048 }
1049
1050 VG_REGPARM(1)
1051 static void log_1I0D(InstrInfo* ii)
1052 {
1053     CacheModelResult IrRes;
1054
1055     current_ii = ii;
1056     IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
1057
1058     CLG_DEBUG(6, "log_1I0D:  Ir  %#lx/%u => %s\n",
1059               CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes));
1060
1061     if (CLG_(current_state).collect) {
1062         ULong* cost_Ir;
1063
1064         if (CLG_(current_state).nonskipped)
1065             cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1066         else
1067             cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
1068
1069         inc_costs(IrRes, cost_Ir,
1070                   CLG_(current_state).cost + fullOffset(EG_IR) );
1071     }
1072 }
1073
1074 VG_REGPARM(2)
1075 static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
1076 {
1077     CacheModelResult Ir1Res, Ir2Res;
1078     ULong *global_cost_Ir;
1079
1080     current_ii = ii1;
1081     Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
1082     current_ii = ii2;
1083     Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
1084
1085     CLG_DEBUG(6, "log_2I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
1086               CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
1087               CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );
1088
1089     if (!CLG_(current_state).collect) return;
1090
1091     global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
1092     if (CLG_(current_state).nonskipped) {
1093         ULong* skipped_cost_Ir =
1094             CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1095
1096         inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
1097         inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
1098         return;
1099     }
1100
1101     inc_costs(Ir1Res, global_cost_Ir,
1102               CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
1103     inc_costs(Ir2Res, global_cost_Ir,
1104               CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
1105 }
1106
1107 VG_REGPARM(3)
1108 static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
1109 {
1110     CacheModelResult Ir1Res, Ir2Res, Ir3Res;
1111     ULong *global_cost_Ir;
1112
1113     current_ii = ii1;
1114     Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
1115     current_ii = ii2;
1116     Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
1117     current_ii = ii3;
1118     Ir3Res = (*simulator.I1_Read)(CLG_(bb_base) + ii3->instr_offset, ii3->instr_size);
1119
1120     CLG_DEBUG(6, "log_3I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
1121               CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
1122               CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
1123               CLG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );
1124
1125     if (!CLG_(current_state).collect) return;
1126
1127     global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
1128     if (CLG_(current_state).nonskipped) {
1129         ULong* skipped_cost_Ir =
1130             CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1131         inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
1132         inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
1133         inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
1134         return;
1135     }
1136
1137     inc_costs(Ir1Res, global_cost_Ir,
1138               CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
1139     inc_costs(Ir2Res, global_cost_Ir,
1140               CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
1141     inc_costs(Ir3Res, global_cost_Ir,
1142               CLG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]);
1143 }
1144
1145 /* Instruction doing a read access */
1146
1147 VG_REGPARM(3)
1148 static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
1149 {
1150     CacheModelResult IrRes, DrRes;
1151
1152     current_ii = ii;
1153     IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
1154     DrRes = (*simulator.D1_Read)(data_addr, data_size);
1155
1156     CLG_DEBUG(6, "log_1I1Dr: Ir  %#lx/%u => %s, Dr  %#lx/%lu => %s\n",
1157               CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
1158               data_addr, data_size, cacheRes(DrRes));
1159
1160     if (CLG_(current_state).collect) {
1161         ULong *cost_Ir, *cost_Dr;
1162
1163         if (CLG_(current_state).nonskipped) {
1164             cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1165             cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
1166         }
1167         else {
1168             cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
1169             cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
1170         }
1171
1172         inc_costs(IrRes, cost_Ir,
1173                   CLG_(current_state).cost + fullOffset(EG_IR) );
1174         inc_costs(DrRes, cost_Dr,
1175                   CLG_(current_state).cost + fullOffset(EG_DR) );
1176     }
1177 }
1178
1179
1180 VG_REGPARM(3)
1181 static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
1182 {
1183     CacheModelResult DrRes;
1184
1185     current_ii = ii;
1186     DrRes = (*simulator.D1_Read)(data_addr, data_size);
1187
1188     CLG_DEBUG(6, "log_0I1Dr: Dr  %#lx/%lu => %s\n",
1189               data_addr, data_size, cacheRes(DrRes));
1190
1191     if (CLG_(current_state).collect) {
1192         ULong *cost_Dr;
1193
1194         if (CLG_(current_state).nonskipped)
1195             cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
1196         else
1197             cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
1198
1199         inc_costs(DrRes, cost_Dr,
1200                   CLG_(current_state).cost + fullOffset(EG_DR) );
1201     }
1202 }
1203
1204
1205 /* Instruction doing a write access */
1206
1207 VG_REGPARM(3)
1208 static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
1209 {
1210     CacheModelResult IrRes, DwRes;
1211
1212     current_ii = ii;
1213     IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
1214     DwRes = (*simulator.D1_Write)(data_addr, data_size);
1215
1216     CLG_DEBUG(6, "log_1I1Dw: Ir  %#lx/%u => %s, Dw  %#lx/%lu => %s\n",
1217               CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
1218               data_addr, data_size, cacheRes(DwRes));
1219
1220     if (CLG_(current_state).collect) {
1221         ULong *cost_Ir, *cost_Dw;
1222
1223         if (CLG_(current_state).nonskipped) {
1224             cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1225             cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
1226         }
1227         else {
1228             cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
1229             cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
1230         }
1231
1232         inc_costs(IrRes, cost_Ir,
1233                   CLG_(current_state).cost + fullOffset(EG_IR) );
1234         inc_costs(DwRes, cost_Dw,
1235                   CLG_(current_state).cost + fullOffset(EG_DW) );
1236     }
1237 }
1238
1239 VG_REGPARM(3)
1240 static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
1241 {
1242     CacheModelResult DwRes;
1243
1244     current_ii = ii;
1245     DwRes = (*simulator.D1_Write)(data_addr, data_size);
1246
1247     CLG_DEBUG(6, "log_0I1Dw: Dw  %#lx/%lu => %s\n",
1248               data_addr, data_size, cacheRes(DwRes));
1249
1250     if (CLG_(current_state).collect) {
1251         ULong *cost_Dw;
1252
1253         if (CLG_(current_state).nonskipped)
1254             cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
1255         else
1256             cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
1257
1258         inc_costs(DwRes, cost_Dw,
1259                   CLG_(current_state).cost + fullOffset(EG_DW) );
1260     }
1261 }
1262
1263
1264
1265 /*------------------------------------------------------------*/
1266 /*--- Cache configuration                                  ---*/
1267 /*------------------------------------------------------------*/
1268
/* Cache description with every initialized field set to -1; used as the
 * "not specified" value for the command line settings below. */
#define UNDEFINED_CACHE     ((cache_t) { -1, -1, -1 })

/* Cache configurations requested via --I1 / --D1 / --LL (written by
 * parse_opt(), read by configure_caches()). */
static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_LL_cache = UNDEFINED_CACHE;
1274
1275
1276 // Checks cache config is ok.  Returns NULL if ok, or a pointer to an error
1277 // string otherwise.
1278 static Char* check_cache(cache_t* cache)
1279 {
1280    // Simulator requires line size and set count to be powers of two.
1281    if (( cache->size % (cache->line_size * cache->assoc) != 0) ||
1282        (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc)))
1283    {
1284       return "Cache set count is not a power of two.\n";
1285    }
1286
1287    // Simulator requires line size to be a power of two.
1288    if (-1 == VG_(log2)(cache->line_size)) {
1289       return "Cache line size is not a power of two.\n";
1290    }
1291
1292    // Then check line size >= 16 -- any smaller and a single instruction could
1293    // straddle three cache lines, which breaks a simulation assertion and is
1294    // stupid anyway.
1295    if (cache->line_size < MIN_LINE_SIZE) {
1296       return "Cache line size is too small.\n";
1297    }
1298
1299    /* Then check cache size > line size (causes seg faults if not). */
1300    if (cache->size <= cache->line_size) {
1301       return "Cache size <= line size.\n";
1302    }
1303
1304    /* Then check assoc <= (size / line size) (seg faults otherwise). */
1305    if (cache->assoc > (cache->size / cache->line_size)) {
1306       return "Cache associativity > (size / line size).\n";
1307    }
1308
1309    return NULL;
1310 }
1311
1312 static
1313 void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc)
1314 {
1315 #define DEFINED(L)   (-1 != L.size  || -1 != L.assoc || -1 != L.line_size)
1316
1317    Char* checkRes;
1318
1319    Bool all_caches_clo_defined =
1320       (DEFINED(clo_I1_cache) &&
1321        DEFINED(clo_D1_cache) &&
1322        DEFINED(clo_LL_cache));
1323
1324    // Set the cache config (using auto-detection, if supported by the
1325    // architecture).
1326    VG_(configure_caches)( I1c, D1c, LLc, all_caches_clo_defined );
1327
1328    // Check the default/auto-detected values.
1329    checkRes = check_cache(I1c);  tl_assert(!checkRes);
1330    checkRes = check_cache(D1c);  tl_assert(!checkRes);
1331    checkRes = check_cache(LLc);  tl_assert(!checkRes);
1332
1333    // Then replace with any defined on the command line.
1334    if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
1335    if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
1336    if (DEFINED(clo_LL_cache)) { *LLc = clo_LL_cache; }
1337
1338    if (VG_(clo_verbosity) > 1) {
1339       VG_(umsg)("Cache configuration used:\n");
1340       VG_(umsg)("  I1: %dB, %d-way, %dB lines\n",
1341                 I1c->size, I1c->assoc, I1c->line_size);
1342       VG_(umsg)("  D1: %dB, %d-way, %dB lines\n",
1343                 D1c->size, D1c->assoc, D1c->line_size);
1344       VG_(umsg)("  LL: %dB, %d-way, %dB lines\n",
1345                 LLc->size, LLc->assoc, LLc->line_size);
1346    }
1347 #undef CMD_LINE_DEFINED
1348 }
1349
1350
1351 /* Initialize and clear simulator state */
1352 static void cachesim_post_clo_init(void)
1353 {
1354   /* Cache configurations. */
1355   cache_t  I1c, D1c, LLc;
1356
1357   /* Initialize access handlers */
1358   if (!CLG_(clo).simulate_cache) {
1359     CLG_(cachesim).log_1I0D  = 0;
1360     CLG_(cachesim).log_1I0D_name = "(no function)";
1361     CLG_(cachesim).log_2I0D  = 0;
1362     CLG_(cachesim).log_2I0D_name = "(no function)";
1363     CLG_(cachesim).log_3I0D  = 0;
1364     CLG_(cachesim).log_3I0D_name = "(no function)";
1365
1366     CLG_(cachesim).log_1I1Dr = 0;
1367     CLG_(cachesim).log_1I1Dr_name = "(no function)";
1368     CLG_(cachesim).log_1I1Dw = 0;
1369     CLG_(cachesim).log_1I1Dw_name = "(no function)";
1370
1371     CLG_(cachesim).log_0I1Dr = 0;
1372     CLG_(cachesim).log_0I1Dr_name = "(no function)";
1373     CLG_(cachesim).log_0I1Dw = 0;
1374     CLG_(cachesim).log_0I1Dw_name = "(no function)";
1375     return;
1376   }
1377
1378   /* Configuration of caches only needed with real cache simulation */
1379   configure_caches(&I1c, &D1c, &LLc);
1380
1381   I1.name = "I1";
1382   D1.name = "D1";
1383   LL.name = "LL";
1384
1385   cachesim_initcache(I1c, &I1);
1386   cachesim_initcache(D1c, &D1);
1387   cachesim_initcache(LLc, &LL);
1388
1389   /* the other cache simulators use the standard helpers
1390    * with dispatching via simulator struct */
1391
1392   CLG_(cachesim).log_1I0D  = log_1I0D;
1393   CLG_(cachesim).log_1I0D_name  = "log_1I0D";
1394   CLG_(cachesim).log_2I0D  = log_2I0D;
1395   CLG_(cachesim).log_2I0D_name  = "log_2I0D";
1396   CLG_(cachesim).log_3I0D  = log_3I0D;
1397   CLG_(cachesim).log_3I0D_name  = "log_3I0D";
1398
1399   CLG_(cachesim).log_1I1Dr = log_1I1Dr;
1400   CLG_(cachesim).log_1I1Dw = log_1I1Dw;
1401   CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
1402   CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
1403
1404   CLG_(cachesim).log_0I1Dr = log_0I1Dr;
1405   CLG_(cachesim).log_0I1Dw = log_0I1Dw;
1406   CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
1407   CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
1408
1409   if (clo_collect_cacheuse) {
1410
1411       /* Output warning for not supported option combinations */
1412       if (clo_simulate_hwpref) {
1413           VG_(message)(Vg_DebugMsg,
1414                        "warning: prefetch simulation can not be "
1415                        "used with cache usage\n");
1416           clo_simulate_hwpref = False;
1417       }
1418
1419       if (clo_simulate_writeback) {
1420           VG_(message)(Vg_DebugMsg,
1421                        "warning: write-back simulation can not be "
1422                        "used with cache usage\n");
1423           clo_simulate_writeback = False;
1424       }
1425
1426       simulator.I1_Read  = cacheuse_I1_doRead;
1427       simulator.D1_Read  = cacheuse_D1_doRead;
1428       simulator.D1_Write = cacheuse_D1_doRead;
1429       return;
1430   }
1431
1432   if (clo_simulate_hwpref) {
1433     prefetch_clear();
1434
1435     if (clo_simulate_writeback) {
1436       simulator.I1_Read  = prefetch_I1_Read;
1437       simulator.D1_Read  = prefetch_D1_Read;
1438       simulator.D1_Write = prefetch_D1_Write;
1439     }
1440     else {
1441       simulator.I1_Read  = prefetch_I1_ref;
1442       simulator.D1_Read  = prefetch_D1_ref;
1443       simulator.D1_Write = prefetch_D1_ref;
1444     }
1445
1446     return;
1447   }
1448
1449   if (clo_simulate_writeback) {
1450       simulator.I1_Read  = cachesim_I1_Read;
1451       simulator.D1_Read  = cachesim_D1_Read;
1452       simulator.D1_Write = cachesim_D1_Write;
1453   }
1454   else {
1455       simulator.I1_Read  = cachesim_I1_ref;
1456       simulator.D1_Read  = cachesim_D1_ref;
1457       simulator.D1_Write = cachesim_D1_ref;
1458   }
1459 }
1460
1461
1462 /* Clear simulator state. Has to be initialized before */
1463 static
1464 void cachesim_clear(void)
1465 {
1466   cachesim_clearcache(&I1);
1467   cachesim_clearcache(&D1);
1468   cachesim_clearcache(&LL);
1469
1470   prefetch_clear();
1471 }
1472
1473
1474 static void cachesim_getdesc(Char* buf)
1475 {
1476   Int p;
1477   p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
1478   p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
1479   VG_(sprintf)(buf+p, "desc: LL cache: %s\n", LL.desc_line);
1480 }
1481
/* Print the usage text for the cache simulator options.
 * The --simulate-sectors line is only part of the text when
 * CLG_EXPERIMENTAL is set. */
static
void cachesim_print_opts(void)
{
  VG_(printf)(
"\n   cache simulator options (does cache simulation if used):\n"
"    --simulate-wb=no|yes      Count write-back events [no]\n"
"    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
#if CLG_EXPERIMENTAL
"    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
#endif
"    --cacheuse=no|yes         Collect cache block use [no]\n"
"    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
"    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
"    --LL=<size>,<assoc>,<line_size>  set LL cache manually\n"
              );
}
1498
1499 static void parse_opt ( cache_t* cache,
1500                         char* opt, Char* optval, UChar kind )
1501 {
1502    Long i1, i2, i3;
1503    Char* endptr;
1504    Char* checkRes;
1505
1506    // Option argument looks like "65536,2,64".  Extract them.
1507    i1 = VG_(strtoll10)(optval,   &endptr); if (*endptr != ',')  goto bad;
1508    i2 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != ',')  goto bad;
1509    i3 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != '\0') goto bad;
1510
1511    // Check for overflow.
1512    cache->size      = (Int)i1;
1513    cache->assoc     = (Int)i2;
1514    cache->line_size = (Int)i3;
1515    if (cache->size      != i1) goto overflow;
1516    if (cache->assoc     != i2) goto overflow;
1517    if (cache->line_size != i3) goto overflow;
1518
1519    checkRes = check_cache(cache);
1520    if (checkRes) {
1521       VG_(fmsg)("%s", checkRes);
1522       goto bad;
1523    }
1524
1525    return;
1526
1527   bad:
1528    VG_(fmsg_bad_option)(opt, "");
1529
1530   overflow:
1531    VG_(fmsg_bad_option)(opt,
1532       "One of the cache parameters was too large and overflowed.\n");
1533 }
1534
1535 /* Check for command line option for cache configuration.
1536  * Return False if unknown and not handled.
1537  *
1538  * Called from CLG_(process_cmd_line_option)() in clo.c
1539  */
1540 static Bool cachesim_parse_opt(Char* arg)
1541 {
1542    Char* tmp_str;
1543
1544    if      VG_BOOL_CLO(arg, "--simulate-wb",      clo_simulate_writeback) {}
1545    else if VG_BOOL_CLO(arg, "--simulate-hwpref",  clo_simulate_hwpref)    {}
1546    else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors)   {}
1547
1548    else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) {
1549       if (clo_collect_cacheuse) {
1550          /* Use counters only make sense with fine dumping */
1551          CLG_(clo).dump_instr = True;
1552       }
1553    }
1554
1555    else if VG_STR_CLO(arg, "--I1", tmp_str)
1556       parse_opt(&clo_I1_cache, arg, tmp_str, 'i');
1557    else if VG_STR_CLO(arg, "--D1", tmp_str)
1558       parse_opt(&clo_D1_cache, arg, tmp_str, '1');
1559    else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
1560             VG_STR_CLO(arg, "--LL", tmp_str))
1561       parse_opt(&clo_LL_cache, arg, tmp_str, '2');
1562   else
1563     return False;
1564
1565   return True;
1566 }
1567
1568 /* Adds commas to ULong, right justifying in a field field_width wide, returns
1569  * the string in buf. */
1570 static
1571 Int commify(ULong n, int field_width, char* buf)
1572 {
1573    int len, n_commas, i, j, new_len, space;
1574
1575    VG_(sprintf)(buf, "%llu", n);
1576    len = VG_(strlen)(buf);
1577    n_commas = (len - 1) / 3;
1578    new_len = len + n_commas;
1579    space = field_width - new_len;
1580
1581    /* Allow for printing a number in a field_width smaller than it's size */
1582    if (space < 0) space = 0;
1583
1584    /* Make j = -1 because we copy the '\0' before doing the numbers in groups
1585     * of three. */
1586    for (j = -1, i = len ; i >= 0; i--) {
1587       buf[i + n_commas + space] = buf[i];
1588
1589       if ((i>0) && (3 == ++j)) {
1590          j = 0;
1591          n_commas--;
1592          buf[i + n_commas + space] = ',';
1593       }
1594    }
1595    /* Right justify in field. */
1596    for (i = 0; i < space; i++)  buf[i] = ' ';
1597    return new_len;
1598 }
1599
1600 static
1601 void percentify(Int n, Int ex, Int field_width, char buf[])
1602 {
1603    int i, len, space;
1604
1605    VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
1606    len = VG_(strlen)(buf);
1607    space = field_width - len;
1608    if (space < 0) space = 0;     /* Allow for v. small field_width */
1609    i = len;
1610
1611    /* Right justify in field */
1612    for (     ; i >= 0;    i--)  buf[i + space] = buf[i];
1613    for (i = 0; i < space; i++)  buf[i] = ' ';
1614 }
1615
1616 static
1617 void cachesim_printstat(Int l1, Int l2, Int l3)
1618 {
1619   FullCost total = CLG_(total_cost), D_total = 0;
1620   ULong LL_total_m, LL_total_mr, LL_total_mw,
1621     LL_total, LL_total_r, LL_total_w;
1622   char buf1[RESULTS_BUF_LEN],
1623     buf2[RESULTS_BUF_LEN],
1624     buf3[RESULTS_BUF_LEN];
1625   Int p;
1626
1627   if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
1628     VG_(message)(Vg_DebugMsg, "Prefetch Up:       %llu\n",
1629                  prefetch_up);
1630     VG_(message)(Vg_DebugMsg, "Prefetch Down:     %llu\n",
1631                  prefetch_down);
1632     VG_(message)(Vg_DebugMsg, "\n");
1633   }
1634
1635   commify(total[fullOffset(EG_IR) +1], l1, buf1);
1636   VG_(message)(Vg_UserMsg, "I1  misses:    %s\n", buf1);
1637
1638   commify(total[fullOffset(EG_IR) +2], l1, buf1);
1639   VG_(message)(Vg_UserMsg, "LLi misses:    %s\n", buf1);
1640
1641   p = 100;
1642
1643   if (0 == total[fullOffset(EG_IR)])
1644     total[fullOffset(EG_IR)] = 1;
1645
1646   percentify(total[fullOffset(EG_IR)+1] * 100 * p /
1647              total[fullOffset(EG_IR)], p, l1+1, buf1);
1648   VG_(message)(Vg_UserMsg, "I1  miss rate: %s\n", buf1);
1649
1650   percentify(total[fullOffset(EG_IR)+2] * 100 * p /
1651              total[fullOffset(EG_IR)], p, l1+1, buf1);
1652   VG_(message)(Vg_UserMsg, "LLi miss rate: %s\n", buf1);
1653   VG_(message)(Vg_UserMsg, "\n");
1654
1655   /* D cache results.
1656      Use the D_refs.rd and D_refs.wr values to determine the
1657    * width of columns 2 & 3. */
1658
1659   D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
1660   CLG_(init_cost)( CLG_(sets).full, D_total);
1661   // we only use the first 3 values of D_total, adding up Dr and Dw costs
1662   CLG_(copy_cost)( CLG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR) );
1663   CLG_(add_cost) ( CLG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) );
1664
1665   commify( D_total[0], l1, buf1);
1666   commify(total[fullOffset(EG_DR)], l2,  buf2);
1667   commify(total[fullOffset(EG_DW)], l3,  buf3);
1668   VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)\n",
1669                buf1,  buf2,  buf3);
1670
1671   commify( D_total[1], l1, buf1);
1672   commify(total[fullOffset(EG_DR)+1], l2, buf2);
1673   commify(total[fullOffset(EG_DW)+1], l3, buf3);
1674   VG_(message)(Vg_UserMsg, "D1  misses:    %s  (%s rd + %s wr)\n",
1675                buf1, buf2, buf3);
1676
1677   commify( D_total[2], l1, buf1);
1678   commify(total[fullOffset(EG_DR)+2], l2, buf2);
1679   commify(total[fullOffset(EG_DW)+2], l3, buf3);
1680   VG_(message)(Vg_UserMsg, "LLd misses:    %s  (%s rd + %s wr)\n",
1681                buf1, buf2, buf3);
1682
1683   p = 10;
1684
1685   if (0 == D_total[0])   D_total[0] = 1;
1686   if (0 == total[fullOffset(EG_DR)]) total[fullOffset(EG_DR)] = 1;
1687   if (0 == total[fullOffset(EG_DW)]) total[fullOffset(EG_DW)] = 1;
1688
1689   percentify( D_total[1] * 100 * p / D_total[0],  p, l1+1, buf1);
1690   percentify(total[fullOffset(EG_DR)+1] * 100 * p /
1691              total[fullOffset(EG_DR)], p, l2+1, buf2);
1692   percentify(total[fullOffset(EG_DW)+1] * 100 * p /
1693              total[fullOffset(EG_DW)], p, l3+1, buf3);
1694   VG_(message)(Vg_UserMsg, "D1  miss rate: %s (%s   + %s  )\n",
1695                buf1, buf2,buf3);
1696
1697   percentify( D_total[2] * 100 * p / D_total[0],  p, l1+1, buf1);
1698   percentify(total[fullOffset(EG_DR)+2] * 100 * p /
1699              total[fullOffset(EG_DR)], p, l2+1, buf2);
1700   percentify(total[fullOffset(EG_DW)+2] * 100 * p /
1701              total[fullOffset(EG_DW)], p, l3+1, buf3);
1702   VG_(message)(Vg_UserMsg, "LLd miss rate: %s (%s   + %s  )\n",
1703                buf1, buf2,buf3);
1704   VG_(message)(Vg_UserMsg, "\n");
1705
1706
1707
1708   /* LL overall results */
1709
1710   LL_total   =
1711     total[fullOffset(EG_DR) +1] +
1712     total[fullOffset(EG_DW) +1] +
1713     total[fullOffset(EG_IR) +1];
1714   LL_total_r =
1715     total[fullOffset(EG_DR) +1] +
1716     total[fullOffset(EG_IR) +1];
1717   LL_total_w = total[fullOffset(EG_DW) +1];
1718   commify(LL_total,   l1, buf1);
1719   commify(LL_total_r, l2, buf2);
1720   commify(LL_total_w, l3, buf3);
1721   VG_(message)(Vg_UserMsg, "LL refs:       %s  (%s rd + %s wr)\n",
1722                buf1, buf2, buf3);
1723
1724   LL_total_m  =
1725     total[fullOffset(EG_DR) +2] +
1726     total[fullOffset(EG_DW) +2] +
1727     total[fullOffset(EG_IR) +2];
1728   LL_total_mr =
1729     total[fullOffset(EG_DR) +2] +
1730     total[fullOffset(EG_IR) +2];
1731   LL_total_mw = total[fullOffset(EG_DW) +2];
1732   commify(LL_total_m,  l1, buf1);
1733   commify(LL_total_mr, l2, buf2);
1734   commify(LL_total_mw, l3, buf3);
1735   VG_(message)(Vg_UserMsg, "LL misses:     %s  (%s rd + %s wr)\n",
1736                buf1, buf2, buf3);
1737
1738   percentify(LL_total_m  * 100 * p /
1739              (total[fullOffset(EG_IR)] + D_total[0]),  p, l1+1, buf1);
1740   percentify(LL_total_mr * 100 * p /
1741              (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
1742              p, l2+1, buf2);
1743   percentify(LL_total_mw * 100 * p /
1744              total[fullOffset(EG_DW)], p, l3+1, buf3);
1745   VG_(message)(Vg_UserMsg, "LL miss rate:  %s (%s   + %s  )\n",
1746                buf1, buf2,buf3);
1747 }
1748
1749
1750 /*------------------------------------------------------------*/
1751 /*--- Setup for Event set.                                 ---*/
1752 /*------------------------------------------------------------*/
1753
1754 struct event_sets CLG_(sets);
1755
1756 void CLG_(init_eventsets)()
1757 {
1758     // Event groups from which the event sets are composed
1759     // the "Use" group only is used with "cacheuse" simulation
1760     if (clo_collect_cacheuse)
1761         CLG_(register_event_group4)(EG_USE,
1762                                     "AcCost1", "SpLoss1", "AcCost2", "SpLoss2");
1763
1764     if (!CLG_(clo).simulate_cache)
1765         CLG_(register_event_group)(EG_IR, "Ir");
1766     else if (!clo_simulate_writeback) {
1767         CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
1768         CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
1769         CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
1770     }
1771     else { // clo_simulate_writeback
1772         CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
1773         CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
1774         CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
1775     }
1776
1777     if (CLG_(clo).simulate_branch) {
1778         CLG_(register_event_group2)(EG_BC, "Bc", "Bcm");
1779         CLG_(register_event_group2)(EG_BI, "Bi", "Bim");
1780     }
1781
1782     if (CLG_(clo).collect_bus)
1783         CLG_(register_event_group)(EG_BUS, "Ge");
1784
1785     if (CLG_(clo).collect_alloc)
1786         CLG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize");
1787
1788     if (CLG_(clo).collect_systime)
1789         CLG_(register_event_group2)(EG_SYS, "sysCount", "sysTime");
1790
1791     // event set used as base for instruction self cost
1792     CLG_(sets).base = CLG_(get_event_set2)(EG_USE, EG_IR);
1793
1794     // event set comprising all event groups, used for inclusive cost
1795     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
1796     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_BC, EG_BI);
1797     CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
1798     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);
1799
1800     CLG_DEBUGIF(1) {
1801         CLG_DEBUG(1, "EventSets:\n");
1802         CLG_(print_eventset)(-2, CLG_(sets).base);
1803         CLG_(print_eventset)(-2, CLG_(sets).full);
1804     }
1805
1806     /* Not-existing events are silently ignored */
1807     CLG_(dumpmap) = CLG_(get_eventmapping)(CLG_(sets).full);
1808     CLG_(append_event)(CLG_(dumpmap), "Ir");
1809     CLG_(append_event)(CLG_(dumpmap), "Dr");
1810     CLG_(append_event)(CLG_(dumpmap), "Dw");
1811     CLG_(append_event)(CLG_(dumpmap), "I1mr");
1812     CLG_(append_event)(CLG_(dumpmap), "D1mr");
1813     CLG_(append_event)(CLG_(dumpmap), "D1mw");
1814     CLG_(append_event)(CLG_(dumpmap), "ILmr");
1815     CLG_(append_event)(CLG_(dumpmap), "DLmr");
1816     CLG_(append_event)(CLG_(dumpmap), "DLmw");
1817     CLG_(append_event)(CLG_(dumpmap), "ILdmr");
1818     CLG_(append_event)(CLG_(dumpmap), "DLdmr");
1819     CLG_(append_event)(CLG_(dumpmap), "DLdmw");
1820     CLG_(append_event)(CLG_(dumpmap), "Bc");
1821     CLG_(append_event)(CLG_(dumpmap), "Bcm");
1822     CLG_(append_event)(CLG_(dumpmap), "Bi");
1823     CLG_(append_event)(CLG_(dumpmap), "Bim");
1824     CLG_(append_event)(CLG_(dumpmap), "AcCost1");
1825     CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
1826     CLG_(append_event)(CLG_(dumpmap), "AcCost2");
1827     CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
1828     CLG_(append_event)(CLG_(dumpmap), "Ge");
1829     CLG_(append_event)(CLG_(dumpmap), "allocCount");
1830     CLG_(append_event)(CLG_(dumpmap), "allocSize");
1831     CLG_(append_event)(CLG_(dumpmap), "sysCount");
1832     CLG_(append_event)(CLG_(dumpmap), "sysTime");
1833 }
1834
1835
1836 /* this is called at dump time for every instruction executed */
1837 static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
1838                                InstrInfo* ii, ULong exe_count)
1839 {
1840     if (!CLG_(clo).simulate_cache)
1841         cost[ fullOffset(EG_IR) ] += exe_count;
1842
1843     if (ii->eventset)
1844         CLG_(add_and_zero_cost2)( CLG_(sets).full, cost,
1845                                   ii->eventset, bbcc->cost + ii->cost_offset);
1846 }
1847
1848 static
1849 void cachesim_finish(void)
1850 {
1851   if (clo_collect_cacheuse)
1852     cacheuse_finish();
1853 }
1854
1855 /*------------------------------------------------------------*/
1856 /*--- The simulator defined in this file                   ---*/
1857 /*------------------------------------------------------------*/
1858
1859 struct cachesim_if CLG_(cachesim) = {
1860   .print_opts    = cachesim_print_opts,
1861   .parse_opt     = cachesim_parse_opt,
1862   .post_clo_init = cachesim_post_clo_init,
1863   .clear         = cachesim_clear,
1864   .getdesc       = cachesim_getdesc,
1865   .printstat     = cachesim_printstat,
1866   .add_icost     = cachesim_add_icost,
1867   .finish        = cachesim_finish,
1868
1869   /* these will be set by cachesim_post_clo_init */
1870   .log_1I0D        = 0,
1871   .log_2I0D        = 0,
1872   .log_3I0D        = 0,
1873
1874   .log_1I1Dr       = 0,
1875   .log_1I1Dw       = 0,
1876
1877   .log_0I1Dr       = 0,
1878   .log_0I1Dw       = 0,
1879
1880   .log_1I0D_name = "(no function)",
1881   .log_2I0D_name = "(no function)",
1882   .log_3I0D_name = "(no function)",
1883
1884   .log_1I1Dr_name = "(no function)",
1885   .log_1I1Dw_name = "(no function)",
1886
1887   .log_0I1Dr_name = "(no function)",
1888   .log_0I1Dw_name = "(no function)",
1889 };
1890
1891
1892 /*--------------------------------------------------------------------*/
/*--- end                                                    sim.c ---*/
1894 /*--------------------------------------------------------------------*/
1895