#include "rename_stage.h"


// Constructs the rename stage: value-initializes the inputs, drives a no-op
// instruction and a deasserted stall on the outputs, sizes the rename tables,
// reference counts and free list, and then seeds each context's tables.
//
// rf is not referenced anywhere in this constructor.
rename_stage::rename_stage(const vector<uint64_t> *rf) :

  // inputs
  instr_in(),
  branch_exec(),
  mem_access_exec(),
  scoreboard_stall(),
  rob_stall(),
  retiring_instr(),
  syscall_exec(),
  simpanic_in(),

  // outputs
  instr_out(decoder::noop),
  rename_stall(false),

  // internal state
  reg_kill_count(NUM_ARCH_REGS, 0),

  // data structures: rob_size RAT slots per context, one retirement RAT per context
  the_rat(num_contexts*rob_size, rename_state(0)),
  the_rrat(num_contexts, rename_state(0)),
  // arguments: registers left over after every context's NUM_ARCH_REGS block, and the
  // first register number past those blocks (presumably count and first free register;
  // confirm against the free-list class)
  FL(num_phys_regs - num_contexts*NUM_ARCH_REGS, num_contexts*NUM_ARCH_REGS),
  //    active_rat(num_contexts, 0),   (active_rat is allocated in the body instead)
  reg_access_count(num_phys_regs, 0),
  free_reg_stall(false),
  p4rename_stall(false),
  earliest_recovery_ts(0ULL),
  simpanic_recovery_mode(false)
{
  // NOTE(review): raw array allocation; no matching delete[] is visible in this file.
  active_rat = new uint32_t[num_contexts];

  // Checkpoint slot 0 of every context and the context's retirement RAT start from
  // the same state, built from context * NUM_ARCH_REGS.
  for (uint32_t ctx = 0; ctx < num_contexts; ++ctx) {
    const rename_state initial_map(ctx * NUM_ARCH_REGS);
    the_rat[index(ctx, 0)] = initial_map;
    the_rrat[ctx] = initial_map;
  }

  // Sets the reference counts and active_rat entries and fixes up the zero register.
  initialize_rats(0);
}
  


// Flattens a (context, checkpoint slot) pair into a position in the_rat.
// Every context (silo) owns rob_size consecutive RATs, so context c's RATs
// begin at c * rob_size.  Neither argument is range-checked.
inline uint32_t rename_stage::index(uint32_t context, uint32_t rat_index) const {
  const uint32_t context_base = context * rob_size;
  return context_base + rat_index;
}

// Drops one reference to phys_reg.  When the last reference goes away the
// register is handed back to the free list.  Releasing a register whose
// reference count is already zero is treated as fatal: the error and the
// current cycle are printed and the process exits with status 1.
void rename_stage::free_register(uint32_t phys_reg) {
  if (debug_mask & 0x0004) {
    printf("RENAME> FREE_REGISTER: freeing register $%u ref count of %u\n", phys_reg, reg_access_count[phys_reg]);
  }

  // Nothing holds this register, so there is nothing to release.
  if (reg_access_count[phys_reg] == 0) {
    printf("RENAME> FREE_REGISTER: freeing register $%u failed due to reference count of %u\n", phys_reg, reg_access_count[phys_reg]);
    printf("Cycle: %llu\n", cycle_count);
    exit(1);
  }

  reg_access_count[phys_reg] -= 1;
  if (reg_access_count[phys_reg] == 0) {
    FL.put_free_obj(phys_reg);
  }
}

// Takes a register from the free list and records one reference to it.
// phys_reg is an out-parameter: FL.get_free_obj is expected to store the
// allocated register number in it, since it is used as the index into
// reg_access_count immediately afterwards.
// NOTE(review): nothing here checks for an exhausted free list; confirm how
// get_free_obj behaves in that case.
inline void rename_stage::get_register(uint32_t& phys_reg) {
  FL.get_free_obj(phys_reg);
  reg_access_count[phys_reg] += 1;
}

// Drops one reference for every register mapped by the given context's
// retirement RAT (architectural registers 1 .. NUM_ARCH_REGS-1).  Mappings to
// physical register 0 are skipped.  A mapped register whose reference count is
// already zero is fatal: the error and cycle are printed and the process exits.
inline void rename_stage::free_registers_used(uint32_t context) {
  for (uint32_t arch_reg = 1; arch_reg < NUM_ARCH_REGS; ++arch_reg) {
    uint32_t phys_reg = the_rrat[context].get_phys_reg(arch_reg);

    // physical register 0 is never released here
    if (phys_reg == 0) continue;

    if (reg_access_count[phys_reg] == 0) {
      printf("RENAME> FREE_REGISTERS_USED: freeing register $%u failed due to reference count of %u\n", phys_reg, reg_access_count[phys_reg]);
      printf("Cycle: %llu\n", cycle_count);
      exit(1);
    }

    // last reference gone: the register goes back on the free list
    reg_access_count[phys_reg] -= 1;
    if (reg_access_count[phys_reg] == 0) {
      FL.put_free_obj(phys_reg);
    }
  }
}

// Adds one reference to every physical register that the context's active RAT
// maps for architectural registers 1 .. NUM_ARCH_REGS-1.
void rename_stage::mark_registers_used(uint32_t context) {
  // the active RAT does not change while we walk it
  const uint32_t rat_slot = index(context, active_rat[context]);

  uint32_t arch_reg = 1;
  while (arch_reg < NUM_ARCH_REGS) {
    const uint32_t phys_reg = the_rat[rat_slot].get_phys_reg(arch_reg);
    reg_access_count[phys_reg] += 1;
    ++arch_reg;
  }
}


void rename_stage::initialize_rats(uint32_t starting_context) 
{
  //start with empty freelist
  FL.reset();

  //reset ref counts to 0
  vector<uint32_t> temp_reg_access_count(num_phys_regs, 0);
  reg_access_count = temp_reg_access_count;

  //FIXME: Kshitiz: This doesn't seem to work for SMT. Threads DON't get the correct phys regs assigned to them
  // SMT: map each thread's tables to the thread's initial segment of the physical register file
  for (uint32_t context=0; context < num_contexts; context++) {
    for(uint32_t phys_reg = context*NUM_ARCH_REGS; phys_reg < (context + 1)*NUM_ARCH_REGS; ++phys_reg) {
      reg_access_count[phys_reg] = 1;
    }
    active_rat[context] = 0;
  }
    
  // clean up initialization so that each thread uses physical register 0 as the zero register
  for (uint32_t context=1; context<num_contexts; context++) {
    reg_access_count[the_rat[index(context, 0)].get_phys_reg(0)] = 0;
    FL.put_free_obj(the_rat[index(context, 0)].get_phys_reg(0) );
    the_rat[index(context, 0)].set_phys_reg(0, 0);
    the_rrat[context].set_phys_reg(0, 0);
  }
}

// Debug dump of the stage, controlled by bits of debug_mask:
//   0x00000004  master enable; prints the output/input instructions, the stall
//               signal and the free-register count
//   0x80        also prints each context's active RAT and retirement RAT
//   0x04000     selects print() instead of print_map() for those tables
//   0x40000     also prints the reference counts (eight per line) and the free list
inline void rename_stage::print_debug_info() {
  if (!(debug_mask & 0x00000004)) return;

  instr_out()->print("RENAME OUT> instr");
  printf(" rstll %x, free_regs %d\n", rename_stall(), FL.num_free_objs());

  instr_in()->print("RENAME IN> instr");
  printf("\n");

  if (debug_mask & 0x80) {
    const bool use_full_print = (debug_mask & 0x04000) != 0;

    for (uint32_t ctx = 0; ctx < num_contexts; ++ctx) {
      const uint32_t slot = index(ctx, active_rat[ctx]);
      printf("context %u rat (active_rat %u, index %u)\n", ctx, active_rat[ctx], slot);

      if (use_full_print) {
        the_rat[slot].print();
      } else {
        the_rat[slot].print_map();
      }

      printf("Retirement rat:\n");
      if (use_full_print) {
        the_rrat[ctx].print();
      } else {
        the_rrat[ctx].print_map();
      }
      printf("\n");
    }
  }

  if (debug_mask & 0x40000) {
    printf("reference counts:\n");
    for (uint32_t reg = 0; reg < num_phys_regs; ++reg) {
      printf("%3u->%3u  ", reg, reg_access_count[reg]);
      // break the line after every eighth entry and after the final one
      if ((reg % 8 == 7) || (reg + 1 == num_phys_regs)) printf("\n");
    }

    FL.print();
  }
}

// Debug consistency check of the free list.  Reports every physical register
// that is mapped by some context's retirement RAT (RRAT) and is also present in
// the free list, and asks the free list to check itself for duplicates.  If
// either check fails, the current cycle is printed and the process exits with
// status 1.
//
// NOTE(review): only the RRATs are examined.  The previous code also read the
// active RAT mapping but overwrote the value before using it, so the active RAT
// was never actually checked; that dead read has been removed.  Confirm whether
// an active-RAT check is wanted as well.
void rename_stage::bullet_proof_check() {
  bool error_cond = false;

  for (uint32_t context = 0; context < num_contexts; ++context) {
    for (uint32_t arch_reg = 0; arch_reg < NUM_ARCH_REGS; ++arch_reg) {
      const uint32_t phys_reg = the_rrat[context].get_phys_reg(arch_reg);

      // linear scan of the free list; acceptable for a debug-only check
      if (find(FL.queue.begin(), FL.queue.end(), phys_reg) != FL.queue.end()) {
        printf("RENAME>  ERROR: physical register $%u in free list and context %u's RRAT\n", phys_reg, context); 
        error_cond = true;
      }
    }
  }

  // a true return from duplicate_check() is treated as an error
  if (FL.duplicate_check()) error_cond = true;

  if (error_cond) {
    printf("Cycle: %llu\n", cycle_count);
    exit(1);
  }
}


// Simpanic recovery: deasserts every stall signal owned by this stage and
// re-initializes the rename bookkeeping for all contexts via initialize_rats(),
// passing along the context carried by the packet.
inline void rename_stage::handle_simpanic(simpanic_packet the_simpanic_packet) {
  // no stall survives a simpanic
  free_reg_stall = false;
  p4rename_stall = false;
  rename_stall = false;

  // frob the RATs for all contexts and rebuild the free list
  initialize_rats(the_simpanic_packet.context);
}


// Returns true when the instruction in the input latch should be renamed this
// cycle; false when it must be dropped instead.
inline bool rename_stage::can_rename_instr()
{
  //NOTE-woley-2005.07.09- the noop test must stay, and must stay first:
  // valgrind breaks without it.  Do not remove it.
  if (instr_in()->noop) return false;

  // a valid simpanic drops the instruction no matter which context owns it
  if (simpanic_in().valid) return false;

  // branch misprediction in the instruction's own context
  if (branch_exec().mispredicted_context(instr_in()->context)) return false;

  // syscall executing in the instruction's own context
  if (syscall_exec().valid && syscall_exec().context == instr_in()->context) return false;

  // memory misspeculation in the instruction's own context
  if (mem_access_exec().mispredicted_context(instr_in()->context)) return false;

  return true;
}

inline void rename_stage::flush_input_instr() 
{
  decoder::instr_h temp_instr = decoder::noop;
  instr_out = temp_instr;  
}


inline void rename_stage::rename_input_instr()
{

  decoder::instr_h the_instr = instr_in();

  // set the instruction's rat checkpoint
  the_instr->rat_checkpt = active_rat[the_instr->context];

  // for a load, create a checkpoint before renaming the instruction
  if (the_instr->is_load) {
    the_rat[index(the_instr->context, (active_rat[the_instr->context] + 1) % rob_size)] = the_rat[index(the_instr->context, active_rat[the_instr->context])];
    active_rat[the_instr->context] = (active_rat[the_instr->context] + 1) % rob_size;
  }


  //do the renaming of source registers
  the_instr->phys_t_reg = the_rat[index(the_instr->context, active_rat[the_instr->context])].get_phys_reg(the_instr->arch_t_reg);
  the_instr->phys_s_reg = the_rat[index(the_instr->context, active_rat[the_instr->context])].get_phys_reg(the_instr->arch_s_reg);
  
  // get free reg and rename the dest register
  if (the_instr->arch_dest_reg != 0) {
    get_register(the_instr->phys_dest_reg);
    the_rat[index(the_instr->context, active_rat[the_instr->context])].set_phys_reg(the_instr->arch_dest_reg,the_instr->phys_dest_reg);
  }
  else {
    // pass thru if arch dest reg is 0
    the_instr->phys_dest_reg = the_instr->arch_dest_reg;
  }
  
  // for a branch or store, create a checkpoint after renaming the instruction
  if (the_instr->is_branch || the_instr->is_jump  || the_instr->is_store) {
    the_rat[index(the_instr->context, (active_rat[the_instr->context] + 1) % rob_size)] = the_rat[index(the_instr->context, active_rat[the_instr->context])];
    active_rat[the_instr->context] = (active_rat[the_instr->context] + 1) % rob_size;
  }
 
  // write the renamed instruction to the output (goes to scoreboard and rob)
  instr_out = the_instr;
}

// Updates the register bookkeeping for the instruction retiring from the ROB.
// Only completed instructions with a non-zero architectural destination matter:
//   - canceled:     the instruction's own physical destination is released
//   - not canceled: the register the retirement RAT previously mapped for that
//                   architectural destination is released, and the retirement
//                   RAT is updated to the instruction's physical destination
inline void rename_stage::process_retiring_instr() {
  if (!retiring_instr()->completed || retiring_instr()->arch_dest_reg == 0) return;

  if (retiring_instr()->canceled) {
    uint32_t phys_dest_reg = retiring_instr()->phys_dest_reg;
    free_register(phys_dest_reg);
    if(debug_mask & 0x0004) printf("RENAME> POINT A: retiring, cancelled instruction frees register %u\n", phys_dest_reg);
    return;
  }

  // instruction completed and is not canceled: normal retirement
  // this instruction could be a mispredicted call - that's okay
  uint32_t c = retiring_instr()->context;
  uint32_t arch_dest_reg = retiring_instr()->arch_dest_reg;
  uint32_t phys_dest_reg = retiring_instr()->phys_dest_reg;

  // release the mapping that is about to be replaced in the RRAT
  uint32_t replaced_reg = the_rrat[c].get_phys_reg(arch_dest_reg);
  free_register(replaced_reg);
  if(debug_mask & 0x0004) printf("Retiring, completed instruciton frees register %u\n", replaced_reg);

  // update the RRAT with the new phys reg value
  the_rrat[c].set_phys_reg(arch_dest_reg, phys_dest_reg);
}

inline void rename_stage::process_misspeculation()
{
  // NOTE: Jump-and-link instructions (jal and jalr) can both mispredict
  // and write back to a register (always architectural register 31).
  // Thus, during a cycle that a mispredicted jump-and-link retires both
  // branch_exec().mispredict and retiring_instr->completed will be true.
  // [mispredicted calls are also completed; mispredicted returns are NOT also completed]
  // on misprediction or misspeculation, reset RAT to checkpoint

  if (alpha_renaming) {
    //simpanics have precedence over everything else
    if(simpanic_in().valid) {
      simpanic_packet the_simpanic = simpanic_in();
      handle_simpanic(the_simpanic); //function restores all RAT tables, builds free list, etc
    }
    else {
      if (mem_access_exec().misspeculation) {
        active_rat[mem_access_exec().recovery_context] = mem_access_exec().rat_checkpt;
      }
      else if (branch_exec().mispredict) {
        active_rat[branch_exec().context] = branch_exec().rat_checkpt;
      }
      else if (syscall_exec().valid) {
        active_rat[syscall_exec().context] = syscall_exec().rat_checkpt;
      }
    }
  }
}


// Recomputes this stage's stall conditions and publishes rename_stall.
//
// Free-register stall: the renamer stalls when it gets close to running out of
// registers in its free list.  The stall signal tells fetch and decode to stop,
// but it does not reach the decoder until AFTER the next clock edge, so one more
// instruction will still arrive and must have the resources to be handled;
// hence the "<= 1" threshold.
//
// Recovery stall (only when alpha_renaming is off): on a misspeculation or
// misprediction the affected context's active RAT index is rolled back to the
// event's checkpoint and p4rename_stall is raised.  When several events overlap,
// only the one with the smallest timestamp / instruction number is tracked.
// The stall is released when the instruction with that number retires, at which
// point the context's active RAT is reloaded from its retirement RAT.
inline void rename_stage::stall_if_needed()
{
  free_reg_stall = (FL.num_free_objs() <= 1);

  if (!alpha_renaming) {
    // NOTE(review): unlike process_misspeculation(), a simpanic here does not
    // suppress the event handling below; confirm that is intended.
    if(simpanic_in().valid) {
      simpanic_packet the_simpanic = simpanic_in();
      handle_simpanic(the_simpanic);
    }

    if (mem_access_exec().misspeculation) {
      // take this event if nothing is pending or it is earlier than the pending one
      if (!p4rename_stall || ((mem_access_exec().recovery_ts) < earliest_recovery_ts)) {
        earliest_recovery_ts = mem_access_exec().recovery_ts;
        active_rat[mem_access_exec().recovery_context] = mem_access_exec().rat_checkpt;
      }
      p4rename_stall = true;
    }
    else if (branch_exec().mispredict) {
      if (!p4rename_stall || (branch_exec().instr_num < earliest_recovery_ts)) {
        earliest_recovery_ts = branch_exec().instr_num;
        active_rat[branch_exec().context] = branch_exec().rat_checkpt;
      }
      p4rename_stall = true;
    }
    else if(syscall_exec().valid) {
      // NOTE(review): this branch does not raise p4rename_stall, unlike the two
      // above; left as found, confirm whether that is deliberate.
      if(!p4rename_stall || (syscall_exec().instr_num < earliest_recovery_ts)) {
        earliest_recovery_ts = syscall_exec().instr_num;
        active_rat[syscall_exec().context] = syscall_exec().rat_checkpt;
      }
    }

    // The tracked instruction is retiring: reload the context's active RAT from
    // its retirement RAT and release the recovery stall.  These must be two
    // separate statements; chained into one assignment, the boolean `false`
    // would be written into both tables.
    if (retiring_instr()->instr_num == earliest_recovery_ts) {
      uint32_t retiring_context = retiring_instr()->context;
      the_rat[index(retiring_context, active_rat[retiring_context])] = the_rrat[retiring_context];
      p4rename_stall = false;
    }
  }

  rename_stall = (free_reg_stall || p4rename_stall);
}


inline void rename_stage::handle_stall()
{
  // stall detected: in all cases this should produce a no-op on the output
  decoder::instr_h temp_instr = decoder::noop;
  instr_out = temp_instr;
}


// Per-cycle evaluation of the rename stage.  Order of work:
//   1. optional debug output and (when bullet_proof_check_cycle is set) the
//      free-list consistency check
//   2. the instruction arriving from the decoder: no-op on a stall, flushed if
//      it may not be renamed, renamed otherwise
//   3. the instruction retiring from the ROB
//   4. state changes caused by misprediction/misspeculation
//   5. recomputation of rename_stall
void rename_stage::recalc() {

  print_debug_info();

  // make sure no phys_reg leaks
  if (bullet_proof_check_cycle) {
    bullet_proof_check();
  }

  // THE INSTRUCTION ARRIVING FROM THE DECODER
  const bool stalled = scoreboard_stall() || rename_stall() || (rob_stall()[0]);
  if (stalled) {
    // any stall produces a no-op on the output
    handle_stall();
  }
  else if (can_rename_instr()) {
    // no stall, input is not a nop, and nothing in the owning context forces a
    // flush: rename the instruction into the output latch
    rename_input_instr();
  }
  else {
    // no stall, but the input instruction must be dropped (see
    // can_rename_instr() for the conditions): flush it
    flush_input_instr();
  }

  // handle the instruction retiring from the ROB
  process_retiring_instr();

  // handle state changes because of misprediction
  process_misspeculation();

  // assert rename_stall if required
  stall_if_needed();
}


// Marks, in phys_regs, every physical register this stage currently knows
// about: each register mapped by a context's active RAT or retirement RAT, and
// each register sitting in the free list.  The marked entries are set to 1;
// other entries are left untouched.
//
// phys_regs is indexed by physical register number and is not bounds-checked
// here, so the caller must size it to cover every register number in use.
void rename_stage::account_phys_regs(vector<uint32_t>& phys_regs) const {
  // The counter is 32 bits wide like every other context loop in this file; an
  // 8-bit counter would wrap and never terminate for more than 255 contexts.
  for (uint32_t context = 0; context < num_contexts; ++context) {
    const uint32_t rat_slot = index(context, active_rat[context]);
    for (uint32_t arch_reg = 0; arch_reg < NUM_ARCH_REGS; ++arch_reg) {
      phys_regs[the_rat[rat_slot].get_phys_reg(arch_reg)] = 1;
      phys_regs[the_rrat[context].get_phys_reg(arch_reg)] = 1;
    }
  }

  for (uint32_t fl_slot = 0; fl_slot < FL.queue.size(); ++fl_slot)
    phys_regs[FL.queue[fl_slot]] = 1;
}

