#include "exec_unit.h"
#include "pretty_print.h"
#include <algorithm>            // for max

// Reads `size` bytes at `vaddr` directly from the memory object of `context`.
//
// Signed reads accept sizes 1, 2 and 4; unsigned reads accept 1, 2, 4 and 8.
// The value produced by the memory accessor is converted to uint64_t on
// return. Any other size prints a diagnostic and terminates the process with
// exit status 1.
uint64_t execution_unit::load_cache(uint32_t vaddr, uint32_t context, uint8_t size, bool is_signed) {
  if (!is_signed) {
    if (size == 1)
      return the_mem[context].load_uint8(vaddr);
    if (size == 2)
      return the_mem[context].load_uint16(vaddr);
    if (size == 4)
      return the_mem[context].load_uint32(vaddr);
    if (size == 8)
      return the_mem[context].load_uint64(vaddr);

    // unsupported width for an unsigned access
    printf("unsigned load size must be 1, 2, 4, or 8 bytes\n");
    exit(1);
  }

  if (size == 1)
    return the_mem[context].load_int8(vaddr);
  if (size == 2)
    return the_mem[context].load_int16(vaddr);
  if (size == 4)
    return the_mem[context].load_int32(vaddr);

  // unsupported width for a signed access
  printf("signed load size must be 1, 2, or 4 bytes\n");
  exit(1);
}

// Reads `size` bytes at `vaddr` through the store queue (the_stq).
//
// The store queue accessor receives the load's timestamp, the address,
// context and size, a pointer to the memory object of `context`, and
// `is_cache_access` by reference (presumably an out-flag reporting whether
// the access went to the cache -- confirm against the store queue).
//
// Signed reads accept sizes 1, 2 and 4; unsigned reads accept 1, 2, 4 and 8.
// Any other size prints a diagnostic and terminates the process with exit
// status 1.
uint64_t execution_unit::load_lsq(uint32_t vaddr, uint32_t context, uint8_t size, bool is_signed, uint64_t timestamp, bool& is_cache_access) {
  if (!is_signed) {
    if (size == 1)
      return the_stq.load_uint8(timestamp, vaddr, context, size, &the_mem[context], is_cache_access);
    if (size == 2)
      return the_stq.load_uint16(timestamp, vaddr, context, size, &the_mem[context], is_cache_access);
    if (size == 4)
      return the_stq.load_uint32(timestamp, vaddr, context, size, &the_mem[context], is_cache_access);
    if (size == 8)
      return the_stq.load_uint64(timestamp, vaddr, context, size, &the_mem[context], is_cache_access);

    // unsupported width for an unsigned access
    printf("unsigned load size must be 1, 2, 4, or 8 bytes\n");
    exit(1);
  }

  if (size == 1)
    return the_stq.load_int8(timestamp, vaddr, context, size, &the_mem[context], is_cache_access);
  if (size == 2)
    return the_stq.load_int16(timestamp, vaddr, context, size, &the_mem[context], is_cache_access);
  if (size == 4)
    return the_stq.load_int32(timestamp, vaddr, context, size, &the_mem[context], is_cache_access);

  // unsupported width for a signed access
  printf("signed load size must be 1, 2, or 4 bytes\n");
  exit(1);
}

// Records the current instruction (the_instr) in the load queue and then
// performs the load through the store queue.
//
// The load-queue entry is written before the value is read. The load is
// timestamped with the instruction number, and the cache-access flag is
// written into mem_access_result.cache_access.
uint64_t execution_unit::loadLSQ(uint32_t vaddr, uint32_t context, uint8_t size, bool is_signed) {
  // register this load in its load-queue slot
  the_ldq.insert(the_instr->ldq_slot,
                 vaddr,
                 size,
                 the_instr->program_counter,
                 the_instr->reorder_slot,
                 the_instr->rat_checkpt,
                 the_instr->prev_pc);

  const uint64_t loaded_value =
    load_lsq(vaddr, context, size, is_signed, the_instr->instr_num, mem_access_result.cache_access);
  return loaded_value;
}

// Entry point for a load of `size` bytes at `vaddr` in `context`.
//
// Returns the loaded value (via loadLSQ), or 0 without touching any state when
// the access is rejected. An access is rejected when `vaddr` is not a multiple
// of `size`, or when `size` is 0: a zero-size access can never be aligned, and
// evaluating `vaddr % 0` would be undefined behaviour.
//
// For an accepted access, exec_load_cnt is incremented and the address and
// size are recorded in mem_access_result before the load is performed.
uint64_t execution_unit::load(uint32_t vaddr, uint32_t context, uint8_t size, bool is_signed) {
  if (debug_mask & 0x0040)
    printf("EXEC> LOAD addr %08x ctext %u size %u signed %u\n",
           vaddr, context, static_cast<unsigned>(size), static_cast<unsigned>(is_signed));

  // if address is unaligned, this is a wrong path instruction
  // alignment errors are bad for sbc, tsc
  // (size is tested first so the modulo below never divides by zero)
  if (size == 0 || (vaddr % size) != 0)
    return 0;

  exec_load_cnt++;
  mem_access_result.mem_address = vaddr;
  mem_access_result.size = size;

  return loadLSQ(vaddr, context, size, is_signed);
}

void execution_unit::storeLSQ(uint64_t data, uint32_t vaddr, uint32_t context, uint8_t size) {
  bool conflict = false;
  ldq_packet res;

  the_stq.insert(the_instr->stq_slot, vaddr, data, size);
  conflict = the_ldq.conflict_check(vaddr, size, context, the_instr->instr_num, res);

  if (conflict) {
    // recovery begins at the pc of the conflicting load
    if(debug_mask & 0x0040) printf("EXEC> conflict with pc %x of context %d \n", 
				   res.instr_pc, res.context);
    mem_access_result.recovery_pc        = res.instr_pc;
    mem_access_result.recovery_ts        = res.inum;
    mem_access_result.recovery_context   = res.context;
    mem_access_result.rat_checkpt        = res.rat_checkpt;
    mem_access_result.prev_pc            = res.prev_pc;
    mem_access_result.misspeculation = true;      
  }
}

// Entry point for a store of `size` bytes of `data` at `vaddr` in `context`.
//
// The store is dropped without touching any state when `vaddr` is not a
// multiple of `size`, or when `size` is 0: a zero-size access can never be
// aligned, and evaluating `vaddr % 0` would be undefined behaviour.
//
// For an accepted access, exec_store_cnt is incremented, the address and size
// are recorded in mem_access_result, and the store is handed to storeLSQ.
void execution_unit::store(uint64_t data, uint32_t vaddr, uint32_t context, uint8_t size) {
  // data is cast because %llx expects unsigned long long, which is not
  // necessarily the type behind uint64_t.
  if (debug_mask & 0x0040)
    printf("EXEC> STORE data: %016llx addr: %08x ctext: %u size: %u\n",
           static_cast<unsigned long long>(data), vaddr, context, static_cast<unsigned>(size));

  // if address is unaligned, this is a wrong path instruction
  // alignment errors are bad for sbc, tsc
  // (size is tested first so the modulo below never divides by zero)
  if (size == 0 || (vaddr % size) != 0)
    return;

  exec_store_cnt++;
  mem_access_result.mem_address = vaddr;
  mem_access_result.size = size;

  storeLSQ(data, vaddr, context, size);
}


// Overwrites reg_file entries 1 .. NUM_ARCH_REGS-1 with the matching entries
// of x_reg_file. Entry 0 is left untouched.
// NOTE(review): x_reg_file is indexed without a bounds check; it is assumed to
// hold at least NUM_ARCH_REGS entries -- confirm against the caller.
void execution_unit::restore_reg_file_on_simpanic(vector<uint64_t> x_reg_file) {
  uint32_t reg_idx = 1;
  while (reg_idx < NUM_ARCH_REGS) {
    reg_file[reg_idx] = x_reg_file[reg_idx];
    ++reg_idx;
  }
}

// Prints execution-unit debug state to stdout, gated by bits of debug_mask.
//
//   bit 0x0040     : current branch and memory-access outputs, the writeback
//                    bus contents, the load and store queues, and the
//                    incoming instruction (raw and pretty-printed).
//   bit 0x01000000 : every entry of reg_file up to num_phys_regs.
//
// Values printed with %llx / %llu are cast to unsigned long long so the
// argument type matches the conversion specifier on every platform.
void execution_unit::print_debug_info() {
  if (debug_mask & 0x0040) {
    printf("EXEC> brtopc %x brfrpc %x brmis %x isbranch %x btaken %x roslot %d\n",
           branch_output().correct_target, branch_output().instr_pc, branch_output().mispredict,
           branch_output().is_branch, branch_output().taken, branch_output().reorder_slot);
    printf("      maisa %x maiss %x maaddr %x mapc %x mamis %x marep %x wbreg %u wbdata %llx wbctext %x\n",
           mem_access_output().is_access, mem_access_output().is_store, mem_access_output().mem_address,
           mem_access_output().instr_pc, mem_access_output().misspeculation, mem_access_output().replay,
           writeback_bus().reg_tag, static_cast<unsigned long long>(writeback_bus().data),
           writeback_bus().context);

    the_ldq.print();
    the_stq.print();

    instr_in()->print("EXEC INSTR_IN>");
    printf("\n\t\t");
    pretty_print(instr_in());
    printf("\n");
  }

  if (debug_mask & 0x01000000) {
    printf("PHYS_REG_FILE>\n");
    // FIXME (pre-existing, unspecified).
    // NOTE(review): the line break is emitted after indices 0, 6, 12, ..., so
    // entry 0 sits alone on the first row and the last row is not terminated
    // by a newline. Possibly what the FIXME refers to -- confirm before
    // changing the output format.
    for (uint32_t i = 0; i < num_phys_regs; i++) {
      printf("# %u V %llu   ", i, static_cast<unsigned long long>(reg_file[i]));
      if (i % 6 == 0)
        printf("\n");
    }
  }
}


// Evaluates the execution unit for the instruction currently on instr_in()
// (presumably once per simulated cycle -- confirm against the caller).
//
// In order, this:
//   1. prints debug state and applies the pending writeback to reg_file;
//   2. commits the head of the store or load queue for a retiring store/load;
//   3. flushes the writeback/store buses and clobbers the load/store queues
//      for a simpanic, branch mispredict, memory misspeculation or syscall
//      visible on the current inputs/outputs, and (when alpha_renaming is
//      off) tracks the earliest recovery timestamp;
//   4. resets the branch, store, memory-access and syscall result packets
//      from the incoming instruction;
//   5. unless the instruction is a noop or is squashed by one of the events
//      above, executes it, models L1/L2 latency for loads and stores, and
//      schedules the result and store-completion packets on their buses;
//   6. publishes branch_result and mem_access_result on the outputs.
//
// printf arguments are cast to the exact type their conversion specifier
// expects (size_t was previously passed to %d, 64-bit values to %llx/%llu).
void
execution_unit::recalc() {

  print_debug_info();

  // apply the value on the writeback bus; tag 0 is never written
  if (writeback_bus().reg_tag != 0) {
    if (debug_mask & 0x00000040)
      printf("EXEC> writeback: $%d <- %llx\n", writeback_bus().reg_tag,
             static_cast<unsigned long long>(writeback_bus().data));
    reg_file[writeback_bus().reg_tag] = writeback_bus().data;
  }

  // any load/store is eligible for retirement, even if the load/store was canceled in the rob
  if (retiring_instr()->is_store)
    the_stq.commit_head(retiring_instr()->context, retiring_instr()->instr_num);
  else if (retiring_instr()->is_load)
    the_ldq.commit_head(retiring_instr()->context, retiring_instr()->instr_num);

  // simpanic triggers flush of the entire machine (every instr from every context)

  // flush the writeback bus and the lsq if the exec unit completed a mispredict/misspeculation
  if (simpanic_in().valid) {
    // for a simpanic, clobber all contexts' ldq/stq. Also remove canceled instructions from queues
    for (uint32_t current_context = 0; current_context < num_contexts; current_context++) {
      writeback_bus.flush(current_context, 0);
      store_bus_rob.flush(current_context, 0);
      store_bus_sched.flush(current_context, 0);

      the_stq.clobber(current_context, 0);
      the_ldq.clobber(current_context, 0);

      // you won't get any retired instructions from rob, so move ldq/stq head beyond canceled instructions
      // (with simpanic, all rob entries are flushed in a jiffy)
      the_stq.flush_head_canceled(current_context);
      the_ldq.flush_head_canceled(current_context);
    }

    if (!alpha_renaming) {
      if (!exec_mispred_detected || (simpanic_in().instr_num < earliest_recovery_ts)) {
        earliest_recovery_ts = simpanic_in().instr_num;
        exec_mispred_detected = true;
      }
    }
    restore_reg_file_on_simpanic(simpanic_in().reg_file);
  }

  if (branch_output().mispredict) {
    // the buses are flushed from the instruction after the branch, while the
    // queues are clobbered with the branch's own instruction number
    writeback_bus.flush(branch_output().context, branch_output().instr_num + 1);

    store_bus_rob.flush(branch_output().context, branch_output().instr_num + 1);
    store_bus_sched.flush(branch_output().context, branch_output().instr_num + 1);
    the_stq.clobber(branch_output().context, branch_output().instr_num);
    the_ldq.clobber(branch_output().context, branch_output().instr_num);

    if (!alpha_renaming) {
      if (!exec_mispred_detected || (branch_output().instr_num < earliest_recovery_ts)) {
        earliest_recovery_ts = branch_output().instr_num;
        exec_mispred_detected = true;
      }
    }
  }

  if (mem_access_output().misspeculation) {
    writeback_bus.flush(mem_access_output().recovery_context, mem_access_output().recovery_ts);
    store_bus_rob.flush(mem_access_output().recovery_context, mem_access_output().recovery_ts);
    store_bus_sched.flush(mem_access_output().recovery_context, mem_access_output().recovery_ts);
    the_stq.clobber(mem_access_output().recovery_context, mem_access_output().recovery_ts);
    the_ldq.clobber(mem_access_output().recovery_context, mem_access_output().recovery_ts);

    if (!alpha_renaming) {
      if (!exec_mispred_detected || (mem_access_output().recovery_ts < earliest_recovery_ts)) {
        earliest_recovery_ts = mem_access_output().recovery_ts;
        exec_mispred_detected = true;
      }
    }
  }

  if (syscall_output().valid) {
    writeback_bus.flush(syscall_output().context, syscall_output().instr_num);
    store_bus_rob.flush(syscall_output().context, syscall_output().instr_num);
    store_bus_sched.flush(syscall_output().context, syscall_output().instr_num);
    the_stq.clobber(syscall_output().context, syscall_output().instr_num);
    the_ldq.clobber(syscall_output().context, syscall_output().instr_num);

    if (!alpha_renaming) {
      if (!exec_mispred_detected || (syscall_output().instr_num < earliest_recovery_ts)) {
        earliest_recovery_ts = syscall_output().instr_num;
        exec_mispred_detected = true;
      }
    }
  }

  // branch information
  branch_result = blank_branch_packet;

  branch_result.instr_pc          = instr_in()->program_counter;
  branch_result.ra_stack_ptr      = instr_in()->ra_stack_ptr;
  branch_result.ra_stack_head     = instr_in()->ra_stack_head;
  branch_result.reorder_slot      = instr_in()->reorder_slot;
  branch_result.sb_slot           = instr_in()->sb_slot;
  branch_result.context           = instr_in()->context;
  branch_result.rat_checkpt       = instr_in()->rat_checkpt;
  branch_result.instr_num         = instr_in()->instr_num;

  // store information
  store_result.is_store     = false;
  store_result.reorder_slot = instr_in()->reorder_slot;
  store_result.sb_slot      = instr_in()->sb_slot;
  store_result.context      = instr_in()->context;

  // load/store information
  mem_access_result.is_access      = false;
  mem_access_result.is_store       = false;
  mem_access_result.cache_access   = false;
  mem_access_result.mem_address    = 0xffffffff;
  mem_access_result.instr_pc       = instr_in()->program_counter;
  mem_access_result.instr_num      = instr_in()->instr_num;
  mem_access_result.prev_pc        = instr_in()->prev_pc;
  mem_access_result.misspeculation = false;
  mem_access_result.replay         = false; // will be set to true if instr must be replayed due to conflicts in bypass/timestamp caches
  mem_access_result.context        = instr_in()->context;
  mem_access_result.sb_slot        = instr_in()->sb_slot;
  mem_access_result.rob_slot       = instr_in()->reorder_slot;
  mem_access_result.rat_checkpt    = instr_in()->rat_checkpt;
  mem_access_result.in_sbc         = false;
  mem_access_result.in_tsc         = false;
  mem_access_result.lsq_search_penalty = false;

  // default load/store recovery to this instruction:
  // a load/store may recover to another instruction
  mem_access_result.recovery_context = instr_in()->context;
  mem_access_result.recovery_pc = instr_in()->program_counter;
  mem_access_result.recovery_ts = instr_in()->instr_num;

  // syscall information
  syscall_output = syscall_exec_packet();

  the_instr = instr_in();
  uint32_t miss_latency = 0;

  // execute only real instructions that are not squashed by a simpanic, an
  // older mispredicted branch, a memory misspeculation recovery, or a syscall
  // in the same context
  if ( !(instr_in()->noop) &&
       !(simpanic_in().valid) &&
       !(branch_output().mispredict && (instr_in()->instr_num > branch_output().instr_num) && (instr_in()->context == branch_output().context)) &&
       !(mem_access_output().misspeculation && (instr_in()->instr_num >= mem_access_output().recovery_ts) && (instr_in()->context == mem_access_output().recovery_context)) &&
       !(syscall_output().valid && (instr_in()->instr_num >= syscall_output().instr_num) && (instr_in()->context == syscall_output().context)))
  {

    result = 0;
    the_pc = instr_in()->program_counter;

    if (debug_mask & 0x0040)
      printf("EXEC> executing instruction:\n");

    the_datapath.execute(instr_in()); // calculate the result

    // cache latency model
    // L1 cache hits/misses not counted for loads/stores that only access the LSQ
    // Added LSQ search penalty
    if (instr_in()->is_load || instr_in()->is_store) {

      uint32_t way = 0;
      uint64_t cycle_ready = 0;
      uint32_t l2way = 0;
      uint64_t l2cycle_ready = 0;
      bool l2hit = false;

      bool hit = the_cache.check(instr_in()->vaddr, way);
      if (hit) {
        // get the cycle ready, get_value updates lru
        cycle_ready = the_cache.get_value(instr_in()->vaddr, way);
      }
      else {
        // look in L2
        l2hit = l2_cache.check(instr_in()->vaddr, l2way);
        if (l2hit) {
          l2cycle_ready = l2_cache.get_value(instr_in()->vaddr, l2way);
          // a line that is already ready is available from the current cycle
          if (l2cycle_ready <= cycle_count)
            l2cycle_ready = cycle_count;
          cycle_ready = l2cycle_ready + dL1_delay * superpipeline_factor;
        }
        else {
          cycle_ready = cycle_count + L2_delay * superpipeline_factor;
          //l2_cache.conditional_update_value(instr_in()->vaddr, cycle_ready, L2_delay);
          l2_cache.update_value(instr_in()->vaddr, cycle_ready);
        }

        // count real L2 cache hits
        if (cycle_ready > (cycle_count + dL1_delay * superpipeline_factor))
          l2misses++;
        else
          l2hits++;

        //the_cache.conditional_update_value(instr_in()->vaddr, cycle_ready, dL1_delay);
        the_cache.update_value(instr_in()->vaddr, cycle_ready);
      }

      // count real L1 cache hits
      if (cycle_ready > cycle_count)
        misses++;
      else
        hits++;

      // latency is the number of cycles until the line is ready, clamped at 0
      bool real_hit = false;
      uint32_t cache_latency = 0;
      if (cycle_ready > cycle_count) {
        cache_latency = cycle_ready - cycle_count;
      }
      else {
        real_hit = true;
      }

      if (debug_mask & 0x0040) {
        printf("EXEC> addr: %x L1_hit: %x L1_way: %d  L2_hit: %x L2_way: %d real_hit: %x cycle_ready: %llu\n",
               instr_in()->vaddr, hit, way, l2hit, l2way, real_hit,
               static_cast<unsigned long long>(cycle_ready));
      }

      miss_latency = cache_latency;
    }

    if (instr_in()->arch_dest_reg != 0) {
      // put the result on the result bus
      // NOTE(review): latency - 1 wraps around if bus_latency + miss_latency
      // is 0; bus_latency is assumed to be at least 1 -- confirm.
      size_t latency = instr_in()->bus_latency + miss_latency;
      bus_packet the_packet(instr_in()->arch_dest_reg, instr_in()->phys_dest_reg, result, instr_in()->sb_slot, instr_in()->reorder_slot, instr_in()->context, instr_in()->instr_num);

      // arbitrate for writeback bus:
      while (writeback_bus.is_in_use(latency-1)) latency++;
      if (debug_mask & 0x00000040)
        printf("\t\t$%d(arch%d) will get %llx in %d\n", instr_in()->phys_dest_reg, instr_in()->arch_dest_reg,
               static_cast<unsigned long long>(result), static_cast<int>(latency - 1));
      writeback_bus.write(the_packet, latency - 1);
    }

    if (instr_in()->is_store) {
      // put the store on the store buses
      size_t rob_latency   = instr_in()->bus_latency;
      size_t sched_latency = rob_latency;

      store_bus_packet the_packet(instr_in()->sb_slot, instr_in()->reorder_slot, instr_in()->context, instr_in()->instr_num);

      // arbitrate for buses
      while (store_bus_rob.is_in_use(rob_latency-1)) rob_latency++;
      while (store_bus_sched.is_in_use(sched_latency-1)) sched_latency++;

      if (debug_mask & 0x0040)
        printf("\t\tstore %llu will be marked complete in the scheduler in %d cycles\n",
               static_cast<unsigned long long>(instr_in()->instr_num), static_cast<int>(sched_latency - 1));
      if (debug_mask & 0x0040)
        printf("\t\tstore %llu will be marked complete in the rob in %d cycles\n",
               static_cast<unsigned long long>(instr_in()->instr_num), static_cast<int>(rob_latency - 1));

      store_bus_rob.write(the_packet, rob_latency - 1);
      store_bus_sched.write(the_packet, sched_latency - 1);
    }

    // delay bus for memory disambiguation results
    // load/store information
    // these results include effects of LSQ/SBC/TSC mechanisms
    // mem_access_result.mem_address, .misspeculation and .size are set by the
    // load/store helpers (load(), store(), storeLSQ()) for accepted accesses
    mem_access_result.is_access = (instr_in()->is_load || instr_in()->is_store);
    mem_access_result.is_store  = instr_in()->is_store;
    mem_access_result.l1_hit = (miss_latency == 0);
    // NOTE(review): the L2 hit/miss counters above compare against
    // dL1_delay * superpipeline_factor, while this threshold uses dL1_delay
    // alone -- confirm whether the factor is intentionally omitted here.
    mem_access_result.l2_hit = (miss_latency <= dL1_delay);

    // branch information
    branch_result.is_branch      = (instr_in()->is_branch || instr_in()->is_jump);
    branch_result.is_cond        = instr_in()->is_branch;
    branch_result.is_call        = instr_in()->is_link;
    branch_result.is_return      = (instr_in()->is_indirect_jump && (instr_in()->arch_s_reg == 31));
    branch_result.correct_target = the_pc;
    branch_result.mispredict     = (branch_result.correct_target != instr_in()->predicted_pc);
    branch_result.taken          = (branch_result.correct_target != (instr_in()->program_counter + 4));
    branch_result.predicted_pc   = instr_in()->predicted_pc;

    // store information
    store_result.is_store = (instr_in()->is_store || instr_in()->is_syscall);

    // syscall information
    //if(instr_in()->is_syscall && alpha_renaming) //send information about retiring syscalls, if need to
    if (instr_in()->is_syscall) // send information about retiring syscalls, if need to
      syscall_output = syscall_exec_packet(instr_in());
  }

  branch_output  = branch_result;
  mem_access_output = mem_access_result;
}
