//$Id: pipelined-model.cc 1623 2006-09-01 19:34:36Z vdhar2 $
#include <stdio.h>
#include <inttypes.h>
#include <vector>
#include <unistd.h>
#include <fcntl.h>
#include <signal.h>
#include <time.h>

#include "circuit.h"
#include "sparse_memory.h"
#include "decode.h"
#include "ifetch.h"
#include "decode_stage.h"
#include "frontend_delay_stage.h"
#include "rename_stage.h"
//#include "acme_rename_stage.h"
#include "rob.h"
#include "scheduler.h"
#include "exec_unit.h"
#include "inorder_scheduler.h"
#include "arch-model.h"
#include "arch-listeners.h"


using namespace std;

// Debugging / progress globals. Most are filled in by main() while it decodes
// the command line.
// Bit mask selecting debug output (-debug); the bit meanings are listed in usage().
uint32_t debug_mask = 0;
// Holds the -debug mask while output is deferred: when -debug_delay or
// -debug_ts was given, main() copies debug_mask here and zeroes debug_mask.
uint32_t delay_debug_mask = 0;
// -debug_cycle sets the flag and opens the named file for writing.
bool debug_cycle = false;
FILE* debug_cyc_file = NULL;
// Set by -progress.
bool progress_indicator = false;
// Set by -tornado_warn (usage(): warn after this many cycles without retirement).
// NOTE(review): main() assigns 100*rob_size to this after option parsing, so
// both this initializer and any -tornado_warn value are overwritten -- confirm
// whether that is intended.
uint32_t tornado_warning = 1000;
uint64_t cycle_count = 0; //cycle count, when NOT in fast-forward mode; main() resets it to 0 before the full simulation
uint64_t fast_forward_cycle_count = 0; //loop counter of the fast-forward loop in main(), bounded by fast_forward
//note: cycle_count starts from zero when fast-forwarding ends and you enter full blown simulation
// Not updated anywhere in the visible part of this file; presumably maintained
// by the fetch stage -- verify.
uint64_t num_fetched_instr = 0;
bool enable_ping = false; //set by -enable_ping; enables a message to output screen every 100,000 instructions
bool skip_retire_check_failures = false; //set by -skip_retire_check_failures; when 'true', allows you to 'skip' over retire-check failures and still make forward progress

uint32_t num_retire_check_failures = 0; //number of retire-checks that failed

// Command line defined global variables
// -bullet_proof <n>; main() asserts the value is greater than 1.
uint32_t bullet_proof_check_freq = 100000; // Number of cycles between each extensive state test
bool bullet_proof_check_cycle = false;
// -phys_regs <n>. main() substitutes default_num_phys_regs only when this is 0
// after parsing, then asserts it exceeds NUM_ARCH_REGS * num_contexts.
uint32_t num_phys_regs = NUM_ARCH_REGS + 1; // initial value; replaced by default_num_phys_regs only if it ends up 0
uint32_t default_num_phys_regs = 92;        // Number of physical registers for register renaming

// Scheduler / Rob
// -schd_issue_buffer_size <n> (or the deprecated -sbrd_fifo <n>).
uint32_t issue_buffer_size = 64; // Number of entries in the issue buffer
// Cleared by -schd_reg_dep_rr ("traverse register deps round robin").
bool     schd_reg_dep_first = true;
// -rob_size <n>. Also supplies the default load/store queue sizes and the
// tornado_warning value computed in main().
uint32_t rob_size = 64;

// Superpipeline
// -superpipeline <n>.
uint32_t superpipeline_factor = 4;
// -fast_forward <n>; upper bound of the fast-forward loop in main().
uint64_t fast_forward = 0; //number of instructions to fast-forward
// -cache_warmer; when set, main() calls warm_caches() for every fast-forwarded instruction.
bool cache_warmer = false; //default no cache warming with ffwding

// address generator
// -ra_stack_size <n>: size of the return address stack.
uint32_t ra_stack_size = 64;

// Cache/Memory
// Set per cache by -cache_lsize / -cache_assoc / -cache_lines / -cache_delay,
// where the cache is chosen by the first letter of the <t> argument (I, D or L).
uint32_t iL1_lsize = 128;
uint32_t iL1_assoc = 2;
uint32_t iL1_lines = 32;
uint32_t iL1_delay = 10;

// NOTE(review): main() constructs the dcache with dL1_lsize as its first
// argument, where the icache and L2 pass their *_lines value; dL1_lines is
// therefore not used in the visible part of main() -- confirm.
uint32_t dL1_lsize = 64;
uint32_t dL1_assoc = 4;
uint32_t dL1_lines = 32;
uint32_t dL1_delay = 10;

uint32_t L2_lsize = 128;
uint32_t L2_assoc = 8;
uint32_t L2_lines = 512;
uint32_t L2_delay = 100;

// -phys_regs_per_flow <n>.
uint32_t default_phys_regs_per_flow = 128;
// -num_contexts <n> (alias -num_threads): number of simulated thread contexts.
uint32_t num_contexts = 1;
uint64_t max_flow_instr = 1000000000;
uint64_t min_flow_space = 10000;

// Out of Order Loads and Stores
// -lsq_size <ldq> <stq>. A value of 0 makes main() fall back to rob_size.
uint32_t stq_entries = 0;
uint32_t ldq_entries = 0;

// -frontend_cycles <n>. main() later rewrites this to
// frontend_delay_size * superpipeline_factor - 7, with a minimum of 2.
// NOTE(review): the subtraction is unsigned, so a product below 7 wraps to a
// very large value instead of hitting the minimum -- confirm.
uint32_t frontend_delay_size = 8;

// -ooo_scheduler: main() builds a scheduler instead of an inorder_scheduler.
bool ooo_sched_flg = false;

// Renaming
// Cleared by -p4_renaming.
bool alpha_renaming = true;

// -aggressive_fetch: turns off the limit of fetching only 1 branch per real cycle.
bool          aggressive_fetch  = false;

//sig handler
// Set to true by sig_handler(), which main() installs for SIGINT.
bool          stop_int              = false;
// Stats
// Simulation statistics counters. All start at zero; none of them is updated
// in the visible part of this file, so they are presumably maintained by the
// pipeline stages -- verify against those sources.
uint32_t exec_load_cnt               = 0;
uint32_t exec_store_cnt              = 0;
uint64_t exec_load_from_cache        = 0;

uint64_t the_instr_count = 0;
uint64_t the_retired_branch_count = 0;
uint64_t the_mispredicted_branch_count = 0;
uint64_t the_mispredicted_non_branch_count = 0;
uint64_t the_retired_load_count = 0;
uint64_t the_retired_store_count = 0;
uint64_t the_retired_misspec_load_count = 0;
uint64_t the_retired_misspec_store_count = 0;
uint64_t lsq_store_stall_cnt = 0;
uint64_t lsq_load_stall_cnt  = 0;

uint64_t icache_accesses = 0;
uint64_t icache_hits = 0;
uint64_t il2_hits = 0;
uint64_t il2_misses = 0;
uint64_t return_count = 0;
uint64_t mispredicted_return_count = 0;
uint64_t num_rename_stall_cycles = 0;
uint64_t num_scoreboard_stall_cycles = 0;
uint64_t num_rob_stall_cycles_ns = 0;
uint64_t num_lsq_stall_cycles_ns = 0;
uint64_t num_rob_stall_cycles_speculative = 0;
uint64_t num_lsq_stall_cycles_speculative = 0;
uint64_t cycles_waiting_to_retire = 0;

uint64_t syscall_flushes = 0;  //number of flushes due to syscalls. NOT the same as the number of syscalls when doing alpha-style renaming:
//in that mode syscalls flush on execution instead of waiting until retirement. (Open question from the author: make them wait until retirement?)



// Prints the command-line help text to stdout and terminates the process
// with exit status 0. It never returns, so callers cannot continue after it.
//
// The default values quoted in the text are hard-coded strings; they have to
// be kept in step by hand with the initializers of the corresponding globals
// at the top of this file.
void
usage(void)
{
  printf("\n\tSIMULATOR\n");
  printf("\nUsage:\n\tpipelined-model [-info] \n");
  printf("\t\t[-superpipeline] [-bp_off] [-phys_regs <n>] [-sbrd_fifo <n>] [-rob_size <n>]\n");
  printf("\t\t[-cache_lsize <t> <n>] [-cache_lines <t> <n>] [-cache_assoc <t> <n>] [-cache_delay <t> <n>]\n");
  printf("\t\t[-debug <mask>] [-debug_delay <n>] [-tornado_warn <n>] [-stop <n>] {-thread exec_name args}\n");

  printf("\nOptional:  (In any order)\n\t-help,-h\tUsage text\n");
  printf("\t-info\t\tDisplays sizes you are using for various components\n");

  printf("\t-superpipeline <n>\tChanges superpipeline factor\n");
  printf("\t-bullet_proof <n>\tDeep architectural state checking every <n> cycles\n");
  // NOTE(review): "(default = 64)" does not match the file-scope initializer of
  // num_phys_regs (NUM_ARCH_REGS + 1) -- confirm the intended default.
  printf("\t-phys_regs <n>\tSets number of physical regs for renaming (default = 64)\n");
  printf("\t-phys_regs_per_flow <n>\tSets number of physical regs allocated per flow(default = 128)\n");
  printf("\t-sbrd_fifo <n>\tSets number of scoreboard/replay fifo entries (default = 64)\n");
  printf("\t-rob_size <n>\tSets number of ROB entries (default = 64)\n");
  printf("\t-cache_lsize <t> <n>\tSets line size of cache (I1|D1|L2) to n Bytes \n\t\t\t\t(default: I1=128, D1=64, L2=128)\n");
  // I1 default corrected to 32 to match the iL1_lines initializer.
  printf("\t-cache_lines <t> <n>\tSets number of lines of cache (I1|D1|L2) to n lines\n\t\t\t\t(default: I1=32, D1=32, L2=512)\n");
  // I1 default corrected to 2 to match the iL1_assoc initializer.
  printf("\t-cache_assoc <t> <n>\tSets associativity of cache (I1|D1|L2) to n ways \n\t\t\t\t(default: I1=2, D1=4, L2=8)\n");
  printf("\t-cache_delay <t> <n>\tSets delay to access next level for (I1|D1|L2) to n cycles \n\t\t\t\t(default: I1=10, D1=10, L2=100)\n");
  printf("\t-thread exec_name args\tThread to run on simulator (with arguments). Must be last simulator option. Must specify 1 or more threads.\n");
  printf("\t-debug <mask>\tUse bit mask, mask[31:0], to turn on debugging output:\n");
  printf("\t\t\tA default setting of all zeroes gives no ouput\n");
  printf("\t\t0.\tDebug info for instruction fetch stage\n");
  printf("\t\t1.\tDebug info for decoder stage\n");
  printf("\t\t2.\tDebug info for register renaming state\n");
  printf("\t\t3.\tDebug info for scheduler stage\n");
  printf("\t\t4.\tDebug info for the rob stage\n");
  printf("\t\t5.\tDebug info for register file stage\n");
  printf("\t\t6.\tDebug info for execution stage\n");
  printf("\t\t7.\tShow current and retirement RATs\n");
  printf("\t\t8.\tShow register renamer free list\n");
  printf("\t\t9.\tShow scheduler fifo destination regs\n");
  printf("\t\t10.\tShow ROB destination regs\n");
  printf("\t\t11.\tShow store addresses and data\n");
  printf("\t\t12.\tShow load addresses and data\n");
  printf("\t\t13.\tPrint architectural dest reg and data for each retiring instruction\n");
  printf("\t\t14.\tDebug info for caches\n");
  printf("\t-debug_delay <n>\tWait until this cycle n before displaying debug info\n");
  printf("\t-debug_ts <n>\tWait until instr time stamp is decoded before displaying debug info\n");
  printf("\t-tornado_warn <n>\tWarn if n cycles are executed without instruction retirement.\n");
  printf("\t-stop <n>\tCycles to run before stoping simulation\n");
  printf("\t-cache_warmer \tTurns on cache warming while ffwding\n");
  printf("\nRequired:  (Must go last)\n");
  printf("\texec_name\tFile name of executeable to be run on simulator\n");
  printf("\targs\t\tThe arguements to be inputed into simulated program\n");
  printf("\nExamples:\n\tExample of using debug mask with full debug info:\n");
  printf("\t\tDebug mask = 111111111111111 in binary or 0x7fff in hex\n");
  printf("\t\tpipelined-model -debug 0x7fff mipsver 5\n");
  printf("\tExample of just turning on debug info for instruction fetch:\n");
  printf("\t\tpipelined-model -debug 1 mipsver 5\n");
  printf("\tA particularly useful mode is:\n");
  printf("\t\tpipelined-model -trace -debug 0x2000 mipsver 5\n");
  printf("\tthis should produce identical output to:\n");
  printf("\t\tarch-model -trace -debug 0x2000 mipsver 5\n");

  printf("\n");
  exit(0);
}

// Reports an invalid value for a command-line option, then prints the usage
// text and terminates.
//
//   error_input  - the offending argument text as typed by the user
//   error_option - name of the option the value was supplied for
//
// Both parameters are read-only, so they are taken as const char*: every call
// site passes a string literal for error_option, and converting a literal to
// plain char* is not valid in C++11 and later.
void input_error(const char* error_input, const char* error_option) {
  printf("\nERROR: the value %s is invalid input for the option \"%s\"\n",
         error_input, error_option);
  usage();

  // usage() already terminates the process; this exit is only a safety net
  // should that ever change.
  exit(0);
}

// Reports that a simulator option appeared after the simulated program's
// command had already started (i.e. after -thread), then prints the usage
// text and terminates.
//
//   error_option - name of the misplaced option
//
// The parameter is read-only and every call site passes a string literal, so
// it is taken as const char*; converting a literal to plain char* is not
// valid in C++11 and later.
void prog_comm_error(const char* error_option) {
  printf("\nERROR: You have entered the option \"%s\" after already entering the\n",
         error_option);
  printf("\tsimulation program name\n");
  usage();

  // usage() already terminates the process; this exit is only a safety net
  // should that ever change.
  exit(0);
}

// Returns true as soon as any of the first num_contexts architectural models
// has its halted flag set, false if none has (including when num_contexts is 0).
//
//   the_arch_model - array of per-context model pointers
//   num_contexts   - number of entries of that array to inspect
bool thread_halted(arch_model** the_arch_model, uint32_t num_contexts) {
  arch_model** const past_last = the_arch_model + num_contexts;
  for (arch_model** model = the_arch_model; model != past_last; ++model) {
    if ((*model)->halted) {
      return true;
    }
  }
  return false;
}

//Disabled Bullet proof check in the absence of a renamer
//void account_phys_regs(rob& the_rob, const rename_stage& the_renamer, const scheduler_core& the_scheduler) {
//  vector<uint32_t> phys_regs(num_phys_regs, 0);
//  the_rob.account_phys_regs(phys_regs);
//  //the_renamer.account_phys_regs(phys_regs);
//  the_scheduler.account_phys_regs(phys_regs);
//
//  bool error = false;
//  for (uint32_t reg = 0; reg < num_phys_regs; reg++) {
//    if (phys_regs[reg] == 0) {
//      printf("LOST PHYS REG %u\n", reg);
//      std::cout<<"Cycle count = "<<cycle_count<<endl;
//      error = true;
//    }
//  }
//  
//  assert(!error && "Lost physical registers!");
//}

// Signal handler installed by main() for SIGINT: records the request in the
// global stop_int flag and returns. The signal number is not used.
// NOTE(review): stop_int is a plain bool; a flag written from a signal handler
// is normally declared volatile sig_atomic_t (or a lock-free atomic) -- confirm.
void sig_handler(int sig) {
  (void)sig;
  stop_int = true;
}

// warm all the caches using the given executed instr and cycle_count
void warm_caches(decoder::instr_h instr,uint64_t fast_forward_cycle_count, cache<uint64_t> &icache, cache<uint64_t> &dcache, cache<uint64_t> &l2_cache ) {

  //NOTE:
  //During ffwding a diff cycle_count is run from 0 to xxxx and then 
  //we shift back to cycle_count and run it from 0 to END. As a result 
  //maintaing cycle_count timestamps during cache warming is not productive 
  //and doesnt match with the cycle_counts for the new run.
  //Also couldnt figure out how to use conditional_update_value() methods

  uint32_t x_pc;
  uint32_t way = 0;
  uint64_t l2cycle_ready = 0;
  uint64_t cycle_ready = 0;
  bool c_hit = false;
 
  /////// first handle icache /////////
  x_pc = instr->program_counter;

  c_hit = icache.check(x_pc, way);
  if (c_hit) {
    //get the cycle ready, get_value updates lru
    cycle_ready = icache.get_value(x_pc, way);
  } else {
    // look in L2
    c_hit = l2_cache.check(x_pc, way);
    if (c_hit) {
      l2cycle_ready = l2_cache.get_value(x_pc, way);
//      if (l2cycle_ready <= fast_forward_cycle_count)
//        l2cycle_ready = fast_forward_cycle_count;
//      cycle_ready = l2cycle_ready + iL1_delay * superpipeline_factor;
    } else {
//      cycle_ready = fast_forward_cycle_count + L2_delay * superpipeline_factor;
      l2_cache.update_value(x_pc, cycle_ready);
    }
    icache.update_value(x_pc, cycle_ready);
  }

  /////// then calc dcache /////////
  if(instr->is_load || instr->is_store) {
    x_pc = instr->vaddr;

    c_hit = dcache.check(x_pc, way);
    if (c_hit) {
      //get the cycle ready, get_value updates lru
      cycle_ready = dcache.get_value(x_pc, way);
    } else {
      // look in L2
      c_hit = l2_cache.check(x_pc, way);
      if (c_hit) {
        l2cycle_ready = l2_cache.get_value(x_pc, way);
 //       if (l2cycle_ready <= fast_forward_cycle_count)
 //         l2cycle_ready = fast_forward_cycle_count;
 //       cycle_ready = l2cycle_ready + dL1_delay * superpipeline_factor;
      } else {
 //       cycle_ready = fast_forward_cycle_count + L2_delay * superpipeline_factor;
        l2_cache.update_value(x_pc, cycle_ready);
      }
      dcache.update_value(x_pc, cycle_ready);
    }
  }

}

int
main(int argc, char* argv[])
{
  // **** Declarations for command line decoding
  uint32_t cur_thread = 0;               // Simulated thread's index
  char*** sim_thread_argv = NULL;   // Simulated threads' argv
  int* sim_thread_argc = NULL;      // Simulated threads' argc
  bool read_sim_command = false;    // indicates beginging of thread info
  //int n = 0;

  bool stop_exec = false;
  uint32_t stop_after = 0;
  bool stop_instr = false;
  uint64_t stop_instr_count = 0ULL;
  bool info_on = false;
  uint64_t debug_delay = 0;
  uint32_t debugging_cycles = 0;
  uint64_t debug_ts = ~(uint64_t)0;
  uint32_t debugging_ts = 0;
  int i = 0;

  printf("command line used:\n");
  for (i=0; i<argc; i++) {
    printf("%s ", argv[i]);
  }
  printf("\n");


  signal(SIGINT, sig_handler);

  // **** PRINTS USAGE IF NO ARGS ARE FOUND WITH PROGRAM
  if(argc == 1)
    usage();

  // **** COMMAND LINE DECODE
  for(int i = 1; i < argc; i++) {
    // Help and usage output
    if(!strcmp(argv[i], "-help") || !strcmp(argv[i], "-h"))
      prog_comm_error("help");

    // warn if several cycles without rob commit
    else if(!strcmp(argv[i], "-tornado_warn")) {
      if (read_sim_command)
        prog_comm_error("tornado_warn");
      i++;
      tornado_warning = atoi(argv[i]);
      if(tornado_warning == 0)
        input_error(argv[i], "tornado_warn");
    }


    // Trace mode prints out the PC to compare with arch-model program flow
    else if(!strcmp(argv[i],"-progress")) {
      if (read_sim_command)
        prog_comm_error("progress");
      progress_indicator = true;
    }

    // Provides debug information as set by number based on which information
    // you would like to see
    else if(!strcmp(argv[i], "-debug")) {
      if (read_sim_command)
        prog_comm_error("debug");
      i++;
      debug_mask = strtoul(argv[i], 0, 0); // allow octal, decimal or hex input
      if(debug_mask == 0)
        input_error(argv[i], "debug");
    }

    // Allows you to stop simulator after a certain number of statereg cycles
    else if(!strcmp(argv[i], "-stop")) {
      if (read_sim_command)
        prog_comm_error("stop");
      i++;

      stop_after = atoi(argv[i]);
      if(stop_after == 0)
        input_error(argv[i], "stop");
      stop_exec = true;
    }

    // stop simulator after a certain number of instructions retire
    else if(!strcmp(argv[i], "-stop_instr")) {
      if (read_sim_command)
        prog_comm_error("stop_instr");
      stop_instr = true;
      stop_instr_count = atoi(argv[++i]);
    }

    // Allows to delay debug output until the given cycle
    else if(!strcmp(argv[i], "-debug_delay")) {
      if (read_sim_command)
        prog_comm_error("debug_delay");
      i++;
      debug_delay = strtoull(argv[i], NULL, 0);
      if(debug_delay == 0)
        input_error(argv[i], "debug_delay");
      debugging_cycles = true;
    }

    // Allows to delay debug output until the given cycle
    else if(!strcmp(argv[i], "-debug_ts")) {
      if (read_sim_command)
        prog_comm_error("debug_ts");
      i++;
      debug_ts = strtoull(argv[i], NULL, 0);
      if(debug_ts == 0)
        input_error(argv[i], "debug_ts");
      debugging_ts = true;
    }
    
    else if(!strcmp(argv[i], "-debug_cycle")) {
      if (read_sim_command)
        prog_comm_error("debug_cycle");

      debug_cycle = true;
      i++;
      debug_cyc_file = fopen(argv[i], "w");
      assert(debug_cyc_file);
    }

    // Sets cache line size
    else if(!strcmp(argv[i], "-cache_lsize")) {
      if (read_sim_command)
        prog_comm_error("cache_lsize");

      i++;
      switch(*argv[i]) {
      case 'D': case 'd':
        dL1_lsize = atoi(argv[i+1]);
        break;
      case 'I': case 'i':
        iL1_lsize = atoi(argv[i+1]);
        break;
      case 'L': case 'l':
        L2_lsize = atoi(argv[i+1]);
        break;
      default:
        input_error(argv[i], "cache_lsize");
          break;
      }

      i++;

      if(atoi(argv[i]) <= 0)
        input_error(argv[i], "cache_lsize");

    }

    // Sets cache associativity
    else if(!strcmp(argv[i], "-cache_assoc")) {
      if (read_sim_command)
        prog_comm_error("cache_assoc");

      i++;
      switch(*argv[i]) {
      case 'D': case 'd':
        dL1_assoc = atoi(argv[i+1]);
        break;
      case 'I': case 'i':
        iL1_assoc = atoi(argv[i+1]);
        break;
      case 'L': case 'l':
        L2_assoc = atoi(argv[i+1]);
        break;
      default:
        input_error(argv[i], "cache_assoc");
        break;
      }
      i++;

      if(atoi(argv[i]) <= 0)
        input_error(argv[i], "cache_assoc");

    }

    // Sets cache lines
    else if(!strcmp(argv[i], "-cache_lines")) {
      if (read_sim_command)
        prog_comm_error("cache_lines");

      i++;
      switch(*argv[i]) {
      case 'D': case 'd':
        dL1_lines = atoi(argv[i+1]);
        break;
      case 'I': case 'i':

        iL1_lines = atoi(argv[i+1]);
        break;
      case 'L': case 'l':
        L2_lines = atoi(argv[i+1]);
        break;
      default:
        input_error(argv[i], "cache_lines");
        break;
      }
      i++;

      if(atoi(argv[i]) <= 0)
        input_error(argv[i], "cache_lines");
    }

    // Sets cache delay
    else if(!strcmp(argv[i], "-cache_delay")) {
      if (read_sim_command)
        prog_comm_error("cache_delay");
      i++;

      switch(*argv[i]) {
      case 'D': case 'd':
        dL1_delay = atoi(argv[i+1]);

        break;
      case 'I': case 'i':
        iL1_delay = atoi(argv[i+1]);
        break;
      case 'L': case 'l':
        L2_delay = atoi(argv[i+1]);
        break;
      default:
        input_error(argv[i], "cache_delay");
        break;
      }
      i++;

      if(atoi(argv[i]) <= 0)
        input_error(argv[i], "cache_delay");
    }

    else if(!strcmp(argv[i], "-phys_regs")) {
      if (read_sim_command)
        prog_comm_error("phys_regs");
      i++;
      num_phys_regs = atoi(argv[i]);
    }

    else if(!strcmp(argv[i], "-phys_regs_per_flow")) {
      if (read_sim_command)
        prog_comm_error("phys_regs_per_flow");
      i++;
      default_phys_regs_per_flow = atoi(argv[i]);
    }

    else if(!strcmp(argv[i], "-frontend_cycles")) {
      if (read_sim_command)
        prog_comm_error("-frontend_cycles");
      i++;
      frontend_delay_size = atoi(argv[i]);
    }

    else if(!strcmp(argv[i], "-rob_size")) {
      if (read_sim_command)
        prog_comm_error("rob_size");
      i++;
      rob_size = atoi(argv[i]);
    }

    else if(!strcmp(argv[i], "-cache_warmer")) {
      if (read_sim_command)
        prog_comm_error("cache_warmer");
      cache_warmer = true;
    }

    // Allows you to display the cache and latency settings
    else if(!strcmp(argv[i], "-info")) {
      if (read_sim_command)
        prog_comm_error("info");
      info_on = true;
    }

    // set superpipelining factor
    else if(!strcmp(argv[i], "-superpipeline")) {
      if (read_sim_command)
        prog_comm_error("superpipeline");
      i++;
      superpipeline_factor = atoi(argv[i]);
    }

    // set size of return address stack
    else if(!strcmp(argv[i], "-ra_stack_size")) {
      if (read_sim_command)
        prog_comm_error("ra_stack_size");
      i++;
      ra_stack_size = atoi(argv[i]);
    }


    // turn off the conservative limit of fetching only 1 branch per real cycle
    else if(!strcmp(argv[i], "-aggressive_fetch")) {
      if (read_sim_command)
        prog_comm_error("aggressive_fetch");
      aggressive_fetch = true;
    }

    // enable pentium 4 style renaming
    else if(!strcmp(argv[i], "-p4_renaming")) {
      if (read_sim_command)
        prog_comm_error("p4_renaming");
      alpha_renaming = false;
    }

    // enable pentium 4 style renaming
    else if(!strcmp(argv[i], "-ooo_scheduler")) {
      if (read_sim_command)
        prog_comm_error("ooo_scheduler");
      ooo_sched_flg = true;
    }

    // set the bullet_proof frequecy check
    else if(!strcmp(argv[i], "-bullet_proof")) {
      if (read_sim_command)
        prog_comm_error("bullet_proof");
      bullet_proof_check_freq = atoi(argv[++i]);
      assert((bullet_proof_check_freq > 1)
             &&"Bullet proof check frequency must be greater than 1.");
    }
 
    // set the sizes of the load and store queues
    else if (!strcmp(argv[i], "-lsq_size")) {
      if (read_sim_command)
        prog_comm_error("-lsq_size");
      ldq_entries = atoi(argv[++i]);
      stq_entries = atoi(argv[++i]);
    }

    // SMT: number of contexts available to run threads
    else if(!strcmp(argv[i], "-num_contexts") || !strcmp(argv[i], "-num_threads")) {
      if (read_sim_command)
        prog_comm_error("num_contexts");
      i++;
      num_contexts = atoi(argv[i]);

      sim_thread_argc = new int[num_contexts];
      sim_thread_argv = new char**[num_contexts];

      for (uint32_t x=0; x < num_contexts; x++) {
        sim_thread_argc[x] = -1;
        sim_thread_argv[x] = NULL;
      }
    }

    // traverse register deps round robin
    else if(!strcmp(argv[i], "-schd_reg_dep_rr")) {
      if (read_sim_command)
        prog_comm_error("schd_reg_dep_rr");
      schd_reg_dep_first = false;
    }

    // issue buffer size
    else if(!strcmp(argv[i], "-schd_issue_buffer_size")) {
      if (read_sim_command)
        prog_comm_error("schd_issue_buffer_size");
      i++;
      issue_buffer_size = atoi(argv[i]);
    }

    else if(!strcmp(argv[i], "-sbrd_fifo")) {
      if (read_sim_command)
        prog_comm_error("sbrd_fifo");

      printf("-----warning: -sbrd_fifo is a depricated option, replaced with -schd_issue_buffer_size\n");
      i++;
      issue_buffer_size = atoi(argv[i]);
    }

    //enable pinging
    else if(!strcmp(argv[i], "-enable_ping")) {
      if (read_sim_command)
        prog_comm_error("enable_ping");
      enable_ping = true;
    }

    else if(!strcmp(argv[i], "-skip_retire_check_failures")) {
      if (read_sim_command)
        prog_comm_error("skip_retire_check_failures");
      skip_retire_check_failures = true;
    }
    else if(!strcmp(argv[i], "-fast_forward")) {
      if (read_sim_command)
        prog_comm_error("fast_forward");
      i++;
      fast_forward = atoi(argv[i]);
    }

    // SMT: extract one set of argv/argc per thread
    else if(!strcmp(argv[i], "-thread")) {
      if (sim_thread_argv == 0) {
        assert(num_contexts < 2);
        assert(sim_thread_argc == 0);
        sim_thread_argc = new int[1];
        sim_thread_argv = new char**[1];
      }

      read_sim_command = true;
      i++;  // move past the "-thread" parameter

      // count args for thread (argc)
      int base_arg = i;
      sim_thread_argc[cur_thread] = 0;
      while ((base_arg < argc) && strcmp(argv[base_arg], "-thread")) {
        sim_thread_argc[cur_thread]++;
        base_arg++;
      }

      // copy args for thread (argv)
      base_arg = i;
      sim_thread_argv[cur_thread] = new char*[sim_thread_argc[cur_thread]];
      while ((i < argc) && strcmp(argv[i], "-thread")) {
        sim_thread_argv[cur_thread][i-base_arg] = argv[i];
        i++;
      }

      i--; // don't move past the next "-thread" parameter
      cur_thread++;
    }
    else {
      printf("\n\n Unable to understand argument %s\n\n", argv[i]);
      usage();
    }

  }
  
  if(num_contexts == 0) {
    printf("Please specify -num_contexts <n>\n");
    exit(1);
  }

  // set default physical registers
  if(!num_phys_regs) {
    num_phys_regs = default_num_phys_regs;
  }

  assert(num_phys_regs > NUM_ARCH_REGS * num_contexts);

  // set default lsq/stq sizes
  if(!stq_entries)
    stq_entries = rob_size;
  if(!ldq_entries)
    ldq_entries = rob_size;

  // set default tornado_warning
  tornado_warning = 100*rob_size;

  // delay displaying debugging information until triggered
  if(debugging_ts || debugging_cycles) {
    delay_debug_mask = debug_mask;
    debug_mask = 0;
  }

  // **** MEMORY STRUCTURE DECLARATIONS ****
  sparse_memory* the_mem;
  arch_model** the_arch_model;
  track_last_writer** the_last_writer_tracker;
  track_call_depth** the_call_depth_tracker;

  the_mem = new sparse_memory[num_contexts];               // SMT: one sparse_mem per thread
  the_arch_model = new arch_model*[num_contexts];          // SMT: one arch_model per thread
  the_last_writer_tracker = new track_last_writer*[num_contexts];
  the_call_depth_tracker = new track_call_depth*[num_contexts];

  for (cur_thread=0; cur_thread<num_contexts; cur_thread++) {
    //the_arch_model[cur_thread] = new arch_model(&the_mem[cur_thread], false);
    the_arch_model[cur_thread] = new arch_model(&the_mem[cur_thread], true);
    the_last_writer_tracker[cur_thread] = new track_last_writer(the_arch_model[cur_thread]);
    the_call_depth_tracker[cur_thread] = new track_call_depth(the_arch_model[cur_thread]);
    the_arch_model[cur_thread]->register_last_writer_tracker(the_last_writer_tracker[cur_thread]);
  }

  //figure out the right size of the delay fifo, make sure its >=2
  frontend_delay_size = frontend_delay_size * superpipeline_factor - 7;
  frontend_delay_size = (frontend_delay_size > 1) ? frontend_delay_size : 2;


  //do the fast-forwarding. Fast forward ALL threads if in SMT mode
  uint32_t load_contexts = num_contexts;

  // SMT: load one elf file per thread
  int retval = 0;
  // SMT: use a single dri to iteratively initialize pc's, mem spaces, and register files for all threads
  datapath_reg_init dri;


  //load elf files for all threads
  for(uint32_t current_thread=0; current_thread< load_contexts; current_thread++) {
    retval = the_arch_model[current_thread]->load_elf_file(sim_thread_argv[current_thread][0],  sim_thread_argc[current_thread],
                                                       sim_thread_argv[current_thread],     &dri);
    if (retval == 0) {
      // init registers for the arch_model that represents current_thread
      the_arch_model[current_thread]->init_regs(&dri);
    }
    else {
      fprintf(stderr, "error loading program image %s\n", sim_thread_argv[current_thread][0]);
      exit(1);
    }
  }


  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // DECLARE ALL THE CACHE'S
  //
  cache<uint64_t> icache = cache<uint64_t>(iL1_lines, iL1_lsize, lg(iL1_lsize), iL1_assoc);
  cache<uint64_t> dcache = cache<uint64_t>(dL1_lsize, dL1_lsize, lg(dL1_lsize), dL1_assoc);
  cache<uint64_t> l2_cache = cache<uint64_t>(L2_lines, L2_lsize, lg(L2_lsize), L2_assoc);   
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  
  //NOW, step all the arch models one instruction at a time
  if(fast_forward)
    printf("Entering fast forward mode. Will fast forward for %llu cycles: , cache_warming:%d\n", fast_forward, cache_warmer);
  
  decoder::instr_h instr;
  
  for(; fast_forward_cycle_count< fast_forward && !thread_halted(the_arch_model, num_contexts); fast_forward_cycle_count++) {
    for(uint32_t current_thread=0; current_thread< load_contexts; current_thread++) {
      instr = the_arch_model[current_thread]->do_cycle();

      if(cache_warmer) {
        warm_caches(instr, fast_forward_cycle_count, icache, dcache, l2_cache );
      }
    }
  }


  if(fast_forward) {
    if(!thread_halted(the_arch_model, num_contexts))
       printf("Finished fast forward. Now starting full simulation\n");
    else
      printf("Processor halted while in fast-forward mode\n");
  }


  //set em_syscalls to false in arch_model. Since fast-forward is finished,
  //it no longer needs to emulate syscalls
  the_arch_model[0]->set_emulate_syscalls(false);

  // **** PIPE STAGE DECLARATIONS ****
  rob the_rob(the_arch_model);
  execution_unit the_exec_unit(the_mem, dcache, l2_cache);

  scheduler_core *tmp_scheduler; 
  scheduler *ooo_scheduler;
  inorder_scheduler *io_scheduler;
  if(ooo_sched_flg) {
    ooo_scheduler = new scheduler(&the_exec_unit);
    tmp_scheduler = ooo_scheduler; 
  } else {
    io_scheduler = new inorder_scheduler(&the_exec_unit);
    tmp_scheduler = io_scheduler;
  }
  scheduler_core &the_scheduler = *(tmp_scheduler); 
  
  rename_stage dummy_rename_stage;
  rename_stage_core &the_renamer = dummy_rename_stage;
  //acme_rename_stage acme_renamer;
  //rename_stage_core &the_renamer = acme_renamer;
  
  decode_stage the_decoder;
  ifetch the_ifetch(the_mem, icache, &(the_exec_unit.l2_cache));
  frontend_delay_stage the_frontend_delay(frontend_delay_size);

  // CONNECT UP PIPELINE
  // Use attach statements to attach inports to stateregs which can be
  // considered outputs when connecting pipeline stages

  // **** CONNECT INSTRUCTON FETCH INPORTS ****
  the_ifetch.simpanic_in.attach(&the_rob.simpanic_out);
  the_ifetch.branch_exec.attach(&the_exec_unit.branch_output);
  the_ifetch.syscall_exec.attach(&the_exec_unit.syscall_output);
  the_ifetch.mem_access_exec.attach(&the_exec_unit.mem_access_output);
  the_ifetch.scoreboard_stall.attach(&the_scheduler.scoreboard_stall);
  the_ifetch.rename_stall.attach(&the_renamer.rename_stall);
  the_ifetch.rob_stall.attach(&the_rob.rob_stall);
  the_ifetch.lsq_stall.attach(&the_exec_unit.lsq_stall);
  the_ifetch.branch_ret.attach(&the_rob.branch_out);
  the_ifetch.retiring_instr.attach(&the_rob.retiring_instr_out);

  // **** CONNECT INSTRUCTION DECODER INPORTS ****
  the_decoder.simpanic_in.attach(&the_rob.simpanic_out);
  the_decoder.instr_in.attach(&the_ifetch.notdecoded_instr);  
  the_decoder.program_counter_in.attach(&the_ifetch.prev_program_counter);
  the_decoder.predicted_pc_in.attach(&the_ifetch.prev_pc_prediction);
  the_decoder.context.attach(&the_ifetch.prev_pc_context);
  the_decoder.btb_miss_in.attach(&the_ifetch.btb_miss_out);
  the_decoder.ra_stack_ptr_in.attach(&the_ifetch.ra_stack_ptr);
  the_decoder.ra_stack_head_in.attach(&the_ifetch.ra_stack_head);
  the_decoder.branch_exec.attach(&the_exec_unit.branch_output);
  the_decoder.syscall_exec.attach(&the_exec_unit.syscall_output);
  the_decoder.mem_access_exec.attach(&the_exec_unit.mem_access_output);
  the_decoder.scoreboard_stall.attach(&the_scheduler.scoreboard_stall);
  the_decoder.rename_stall.attach(&the_renamer.rename_stall);
  the_decoder.rob_stall.attach(&the_rob.rob_stall);

  // **** CONNECT FRONTEND STALL INPORTS ****
  the_frontend_delay.simpanic_in.attach(&the_rob.simpanic_out);
  the_frontend_delay.instr_in.attach(&the_decoder.instr_out);
  the_frontend_delay.branch_exec.attach(&the_exec_unit.branch_output);
  the_frontend_delay.syscall_exec.attach(&the_exec_unit.syscall_output);
  the_frontend_delay.mem_access_exec.attach(&the_exec_unit.mem_access_output);
  the_frontend_delay.scoreboard_stall.attach(&the_scheduler.scoreboard_stall);
  the_frontend_delay.rename_stall.attach(&the_renamer.rename_stall);
  the_frontend_delay.rob_stall.attach(&the_rob.rob_stall);

  // **** CONNECT RENAMER INPORTS ****
  the_renamer.simpanic_in.attach(&the_rob.simpanic_out);
  the_renamer.instr_in.attach(&the_frontend_delay.instr_out);
  the_renamer.branch_exec.attach(&the_exec_unit.branch_output);
  the_renamer.syscall_exec.attach(&the_exec_unit.syscall_output);
  the_renamer.mem_access_exec.attach(&the_exec_unit.mem_access_output);
  the_renamer.scoreboard_stall.attach(&the_scheduler.scoreboard_stall);
  the_renamer.rob_stall.attach(&the_rob.rob_stall);
  the_renamer.branch_rob.attach(&the_rob.branch_out);
  the_renamer.mem_access_rob.attach(&the_rob.mem_access_out);
  the_renamer.retiring_instr.attach(&the_rob.retiring_instr_out);

  // **** CONNECT SCHEDULER INPORTS ****
  the_scheduler.simpanic_in.attach(&the_rob.simpanic_out);
  the_scheduler.instr_in.attach(&the_renamer.instr_out);
  the_scheduler.decode_instr_num.attach(&the_decoder.instr_num);
  the_scheduler.rob_head.attach(&the_rob.rob_head);
  the_scheduler.branch_exec.attach(&the_exec_unit.branch_output);
  the_scheduler.syscall_exec.attach(&the_exec_unit.syscall_output);
  the_scheduler.mem_access_exec.attach(&the_exec_unit.mem_access_output);
  the_scheduler.writeback_bus.attach(&the_exec_unit.writeback_bus);
  the_scheduler.store_bus.attach(&the_exec_unit.store_bus_sched);
  the_scheduler.frontend_delay_counts.attach(&the_frontend_delay.instr_counts);
  the_scheduler.retiring_instr.attach(&the_rob.retiring_instr_out);

  // **** CONNECT REORDER BUFFER INPORTS ****
  the_rob.instr_in.attach(&the_renamer.instr_out);
  the_rob.branch_exec.attach(&the_exec_unit.branch_output);
  the_rob.syscall_exec.attach(&the_exec_unit.syscall_output);
  the_rob.writeback_bus.attach(&the_exec_unit.writeback_bus);
  the_rob.store_bus.attach(&the_exec_unit.store_bus_rob);
  the_rob.mem_access_exec.attach(&the_exec_unit.mem_access_output);

  // **** CONNECT EXECUTION STAGE INPORTS ****
  the_exec_unit.simpanic_in.attach(&the_rob.simpanic_out);
  the_exec_unit.instr_in.attach(&the_scheduler.instr_out);
  the_exec_unit.mem_access_rob.attach(&the_rob.mem_access_out);
  the_exec_unit.branch_rob.attach(&the_rob.branch_out);
  the_exec_unit.retiring_instr.attach(&the_rob.retiring_instr_out);

  // exit if the ROB size is too small
  if(the_rob.stall_prev_stages(0)) {
    printf("ERROR:  ROB size is too small to support the superpipeline factor! (exiting)\n");
    exit(1);
  }

  //initialise the pipeline stages with updated reg_file and PC from fast-forwarded arch-models

  for (cur_thread=0; cur_thread<load_contexts; cur_thread++) {
    // SMT: initialization of per-thread pc's from fast-forwarded arch_model
    the_ifetch.pc[cur_thread] = the_arch_model[cur_thread]->get_pc();
    
    // SET REGISTERS WITH POINTERS TO MAIN'S ARGV AND ARGC
    
    // SMT: modified reg_init to partition reg file among threads and do an init for each thread
    //modified again: exec_unit is passed reg_file from arch_model
    the_exec_unit.reg_init(the_arch_model[cur_thread]->get_reg_file(), cur_thread);
  }
  // set the program counter to the first context's pc
  the_ifetch.program_counter = the_ifetch.pc[0];

  // **** THIS CODE RUNS PROGRAM - PROGRAM RUNS UNTIL HALT SIGNAL IS RECEIVED
  cycle_count = 0;

  time_t first_time = time(0);
  time_t last_time = first_time;         // time in seconds
  uint64_t last_cycle_count = 0;
  uint64_t last_instr_count = 0;

  // SMT: quit when any of the threads halts
  while (!thread_halted(the_arch_model, num_contexts) && !(stop_exec && (stop_after-- <= 0)) &&
         !(stop_instr && (the_instr_count >= stop_instr_count)) && !stop_int ) {
    if ((debugging_cycles && (debug_delay == cycle_count)) || 
        (debugging_ts && (debug_ts == the_decoder.instr_out()->instr_num))) {
      debug_mask = delay_debug_mask;
    }

    if (debug_cycle && (cycle_count >= 10000) && ((cycle_count % 10000) == 0)) {
      fprintf(debug_cyc_file, "cycle = %llu, instr = %llu\n", cycle_count, the_instr_count);
      fflush(debug_cyc_file);
    }

    if ((debug_mask > 0))
      printf("--------------- cycle= %llu\t-----------------------------------------\n", cycle_count);

    // every bullet_proof cycle, verify that no physical registers have been lost
    //bullet_proof_check_cycle = ((cycle_count | 1) % bullet_proof_check_freq == 1);
    //if(bullet_proof_check_cycle) {
    //  account_phys_regs(the_rob, the_renamer, the_scheduler);
    //}

    clocked_net::pulse(); // Toggles statereg clocks to change values like a register in real circuits.
    circuit::level();     // Updates each pipe stage based on new statereg inputs
    cycle_count++;

    if(enable_ping && (cycle_count % PING_FREQ == 0) ) {
      printf("Cycle %llu\n", cycle_count);
    }


    if (progress_indicator && ((cycle_count % 512) == 0)) {
      time_t this_time = time(0);
      if ((this_time - last_time) >= 30) {
        double epoch_secs = this_time - last_time;
        double epoch_clocks = cycle_count - last_cycle_count;
        double epoch_instrs = the_instr_count - last_instr_count;
        fprintf(stdout, "\n[cc:%llu (%.3f)\t%llu (%.3f)\t%.3f\t%.3f]\n",
                cycle_count,
                epoch_clocks/epoch_secs,
                the_instr_count,
                epoch_instrs/epoch_secs,
                ((double)superpipeline_factor * epoch_instrs) / epoch_clocks,
                (((double)superpipeline_factor * (double)the_instr_count) /
                 (double)cycle_count));

        last_time = this_time;
        last_cycle_count = cycle_count;
        last_instr_count = the_instr_count;
      }
    }
  }

  time_t finish_time = time(0);

  printf("\n--SIMULATION STATS--\n");

  printf("\narch_model::sparse_memory_stats> ");  the_arch_model[0]->print_sparse_memory_stats();

  printf("\nnum_phys_regs: %d\n", num_phys_regs);
  printf("issue_buffer_size: %d\n", issue_buffer_size);

  printf("\n--Performance Stats--\n");

  double ipc = (double) the_instr_count / (double) cycle_count;
  printf("\nNumber of cycles run: %llu\nNumber of instructions: %llu\n",
         cycle_count, the_instr_count);
  printf("utilization: %f\n\n", ipc);

  if (superpipeline_factor > 1) {
    printf("\n-----------------------------------------------------------\n");
    printf("\n\n%dx clock gives equivalent speed to single-pump IPC of %.3f\n\n",
           superpipeline_factor, (double)superpipeline_factor * ipc);
    printf("\n-----------------------------------------------------------\n");
  }

  printf("Number of instructions fetched: %llu\n", num_fetched_instr);
  printf("Instructions fetched per cycle: %5.3f\n\n", (double) num_fetched_instr / (double) cycle_count);

  printf("Cycles rename stalled:  %llu\n", num_rename_stall_cycles);
  printf("Cycles scoreboard stalled:  %llu\n", num_scoreboard_stall_cycles);
  printf("Non-speculative context stall stats:\n");
  printf("Cycles rob stalled:  %llu\n", num_rob_stall_cycles_ns);
  printf("Cycles lsq stalled:  %llu\n\n", num_lsq_stall_cycles_ns);
  printf("Cumulative, speculative context stall stats:\n");
  printf("rob stall signals:  %llu\n", num_rob_stall_cycles_speculative);
  printf("lsq stall signals:  %llu\n\n", num_lsq_stall_cycles_speculative);

  printf("avg. CPI waiting for retirement post execution: %.3f\n\n", (double)cycles_waiting_to_retire / (double) the_instr_count);

  printf("syscall flushes:  %llu\n\n", syscall_flushes);

  printf("icache accesses: %llu, icache misses %llu, %5.3f%%\n\n", icache_accesses, icache_accesses - icache_hits,
         ((float) icache_hits) / icache_accesses );

  printf("l2 icache accesses: %llu, icache misses %llu, %5.3f%%\n\n", il2_hits + il2_misses, il2_misses,
         ((float) il2_hits) / (il2_hits + il2_misses) );

  printf("cache accesses: %llu, cache misses: %llu, %5.3f%%\n\n", the_exec_unit.hits + the_exec_unit.misses, the_exec_unit.misses, 
         ((float) the_exec_unit.hits ) / (the_exec_unit.hits + the_exec_unit.misses));

  printf("dl2cache accesses: %llu, l2cache misses: %llu, %5.3f%%\n\n", the_exec_unit.l2hits + the_exec_unit.l2misses, the_exec_unit.l2misses, 
         ((float) the_exec_unit.l2hits ) / (the_exec_unit.l2hits + the_exec_unit.l2misses));

  printf("branches: %llu, mispredicted %llu\n\tmispredictiction rate: %.1f%%\n",
         the_retired_branch_count, the_mispredicted_branch_count,
         100.0 * (double)the_mispredicted_branch_count / (double)the_retired_branch_count);

  printf("\tnon-branches mispredicted: %llu\n", the_mispredicted_non_branch_count);

  printf("returns: %llu, mispredicted %llu\n\tmisprediction rate: %.1f%%\n\taccounts for %.1f%% of mispredictions\n\tand %.1f%% of branches\n",
         return_count, mispredicted_return_count,
         100.0 * (double)mispredicted_return_count / (double)return_count,
         100.0 * (double)mispredicted_return_count / (double)the_mispredicted_branch_count,
         100.0 * (double)return_count / (double)the_retired_branch_count);

  printf("memory accesses: %llu, misspeculated %llu\n\tmem access misspeculation rate: %.2f%%\n",
         the_retired_load_count + the_retired_store_count, the_retired_misspec_load_count + the_retired_misspec_store_count,
         100.0 * (double)(the_retired_misspec_load_count + the_retired_misspec_store_count) / (double)(the_retired_load_count + the_retired_store_count));
  printf("loads:  %llu, misspeculated %llu\n\tld misspeculation rate: %.2f%%\n",
         the_retired_load_count, the_retired_misspec_load_count,
         100.0 * (double)(the_retired_misspec_load_count) / (double)(the_retired_load_count));
  printf("stores: %llu, misspeculated %llu\n\tst misspeculation rate: %.2f%%\n",
         the_retired_store_count, the_retired_misspec_store_count,
         100.0 * (double)(the_retired_misspec_store_count) / (double)(the_retired_store_count));

  if (debug_cyc_file)
    fclose(debug_cyc_file);

  printf("simulator speed: %.3fKHz\n",
         (double)cycle_count / ((double)(finish_time - first_time) * 1000.0));

  delete [] sim_thread_argc;
  delete [] sim_thread_argv;
  delete [] the_mem;
  delete [] the_arch_model;
}

// debug masks
// 0x00000001 - ifetch stage
// 0x00000002 - decode stage
// 0x00000004 - rename stage
// 0x00000008 - scoreboard stage
// 0x00000010 - rob stage
// 0x00000040 - exec_unit
// 0x00000080 - RAT and RRAT
// 0x00000100 - free list
// 0x00000400 - rob destination registers
// 0x00000800 - store addresses and data
// 0x00001000 - load addresses and data
// 0x00002000 - arch destination reg and data for each retiring instruction
