#ifndef EXEC_UNIT_H_GUARD
#define EXEC_UNIT_H_GUARD

#include "decode.h"
#include "circuit.h"
#include "delay.h"
#include "sparse_memory.h"
#include "packets.h"
#include "bitops.h"
#include "execute.h"
#include "tag_file.h"
#include "rob.h"
#include "cache.h"
#include "globals.h"
#include "lsq_pf.h"

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <deque>
using namespace std;

// Command-line global variables
// exec_unit stage               : debug_mask is 0x0040
// show store addresses and data : debug_mask is 0x0800
// show load addresses and data  : debug_mask is 0x1000
// print phys reg file           : debug_mask is 0x01000000

// Simulator-wide settings. They are only declared here; the definitions live
// in another translation unit.
extern uint32_t debug_mask;            // debug output selector (bit values listed in the comment above)
extern uint32_t cache_line_psize;
extern uint32_t num_phys_regs;         // number of entries execution_unit allocates in its reg_file
extern uint32_t rob_size;
extern uint64_t cycle_count;
extern uint32_t superpipeline_factor;
extern bool     alpha_renaming;

// Store/load queue sizes; execution_unit multiplies each by num_contexts when
// constructing its stq/ldq members.
extern uint32_t stq_entries;
extern uint32_t ldq_entries;

// memory subsystem stats
extern uint32_t exec_load_cnt;
extern uint32_t exec_store_cnt;
extern uint64_t exec_load_from_cache;
extern uint64_t lsq_store_stall_cnt;
extern uint64_t lsq_load_stall_cnt;

// cache parameters
// dL1_*: presumably the level-1 data cache (line size, associativity,
// line count, access delay) -- units are not visible here, confirm at the definition.
extern uint32_t dL1_lsize;
extern uint32_t dL1_assoc;
extern uint32_t dL1_lines;
extern uint32_t dL1_delay;

// L2_*: the same four parameters, presumably for the second-level cache.
extern uint32_t L2_lsize;
extern uint32_t L2_assoc;
extern uint32_t L2_lines;
extern uint32_t L2_delay;

// Execution stage of the simulated pipeline.
//
// Owns the physical register file, the store queue and load queue, and holds
// references to the data cache and L2 cache supplied by the creator.  Inputs
// arrive through the inport<> members and results leave through the
// result_bus<> / statereg<> members declared below.
//
// The private accessor and memory helpers near the end of the class
// (s_op64, dest, load_*, store_*, memcpy_*, halt, ...) are small forwarding
// wrappers around reg_file, result, the_pc, load()/store() and the_mem.
class execution_unit : circuit, public state_semantics
{
 public:
  // Builds an execution unit.
  //   mem         - backing memory; indexed by context (the_mem[context]) in
  //                 the host-copy and syscall helpers, so it must point at
  //                 one sparse_memory per context that will be used.
  //   a_dcache    - data cache, stored by reference (must outlive this object).
  //   a_l2_cache  - L2 cache, stored by reference (must outlive this object).
  // The register file is sized from num_phys_regs and zero-filled; the store
  // and load queues are sized from stq_entries / ldq_entries times
  // num_contexts.  Initializers are listed in member declaration order.
  execution_unit(sparse_memory* mem, cache<uint64_t> &a_dcache, cache<uint64_t> &a_l2_cache) :
  circuit(),
    // inputs
    instr_in(),
    mem_access_rob(),
    branch_rob(),
    retiring_instr(),
    simpanic_in(),

    // outputs
    writeback_bus(),
    store_bus_rob(),
    store_bus_sched(),
    branch_output(),
    mem_access_output(),
    syscall_output(),
    lsq_stall(vector<int>(num_contexts, false)),

    // data structures
    the_datapath(this),
    reg_file(num_phys_regs, 0),
    the_mem(mem),
    the_stq(stq_entries * num_contexts),
    the_ldq(ldq_entries * num_contexts, &the_stq),
    branch_control(),
    mem_control(),

    // simpanic state: previously left uninitialized, so any read before the
    // first assignment saw an indeterminate value.
    simpanic_wait_state(false),

    // stats
    hits(0),
    misses(0),
    l2hits(0),
    l2misses(0),

    blank_branch_packet(),
    result(0),
    the_pc(0),
    exec_mispred_detected(false),
    earliest_recovery_ts(0ULL),
    the_cache(a_dcache),
    l2_cache(a_l2_cache)
  {  }

  ~execution_unit() {  }

  // inputs - execution
  inport<decoder::instr_h>  instr_in;           // the instruction to execute
  inport<mem_access_packet> mem_access_rob;     // memory access info for retiring instruction
  inport<branch_packet>     branch_rob;
  inport<decoder::instr_h>  retiring_instr;
  inport<simpanic_packet> simpanic_in; //retire check failures

  // outputs
  result_bus<bus_packet>          writeback_bus;     // register data
  result_bus<store_bus_packet>    store_bus_rob;     // store data
  result_bus<store_bus_packet>    store_bus_sched;   // store data
  statereg<branch_packet>         branch_output;     // branch unit
  statereg<mem_access_packet>     mem_access_output; // load/store unit
  statereg<syscall_exec_packet>   syscall_output;    // was a syscall executed?
  statereg<vector<int> >          lsq_stall;         // one entry per context, all initially 0

  // data structures
  datapath the_datapath;
  vector<uint64_t> reg_file;     // num_phys_regs entries, zero-initialized
  sparse_memory* the_mem;        // not owned; indexed by context
  stq the_stq;
  ldq the_ldq;                   // constructed with a pointer to the_stq
  branch_packet branch_control;
  mem_access_packet mem_control;

  //simpanic state
  //when rob asserts a simpanic, exec_unit needs to restore the reg_file.
  //however, exec_unit must wait for 1 cycle before restoring reg_file.
  //starts out false.
  bool simpanic_wait_state;

  // stats
  uint64_t hits;
  uint64_t misses;
  uint64_t l2hits;
  uint64_t l2misses;

private:
  branch_packet blank_branch_packet;

  uint64_t              result;                 // returned by reference from dest()
  uint32_t              the_pc;                 // returned by reference from program_counter()
  decoder::instr_h      the_instr;
  branch_packet         branch_result;
  mem_access_packet     mem_access_result;
  store_packet          store_result;
  bool                  exec_mispred_detected;
  uint64_t              earliest_recovery_ts;

  cache<uint64_t>       & the_cache;
 public:
  // make caches public so others can access them if necessary,
  // primarily needed so ifetch can access the l2
  cache<uint64_t>       & l2_cache;

  // external initialization functions

  // Seeds registers 29, 4 and 5 of the given context from dri.  The context's
  // registers are addressed at offset context*NUM_ARCH_REGS in reg_file.
  void reg_init(datapath_reg_init* dri, uint32_t context=0) {
    const uint32_t base = context * NUM_ARCH_REGS;
    reg_file[29 + base] = dri->reg_29;
    reg_file[ 4 + base] = dri->reg_4;
    reg_file[ 5 + base] = dri->reg_5;
  }


  //when fast-forward is enabled, the initialized register file is larger than just (R29, R4, R5)
  // Copies entries 1 .. NUM_ARCH_REGS-1 of x_reg_file into the given context's
  // slice of reg_file; entry 0 is deliberately skipped.  x_reg_file must hold
  // at least NUM_ARCH_REGS entries (not checked).  Taken by const reference
  // so the caller's vector is no longer copied on every call.
  void reg_init(const vector<uint64_t>& x_reg_file, uint32_t x_context=0) {
    const uint32_t base = x_context * NUM_ARCH_REGS;
    for(int i = 1; i<NUM_ARCH_REGS; i++)
      reg_file[i + base] = x_reg_file[i];
  }

 private:

  // Memory access primitives, implemented out of line.
  uint64_t load_cache(uint32_t vaddr, uint32_t context, uint8_t size, bool is_signed);
  uint64_t load_lsq(uint32_t vaddr, uint32_t context, uint8_t size, bool is_signed, uint64_t timestamp, bool& is_cache_access);
  uint64_t load(uint32_t vaddr, uint32_t context, uint8_t size, bool is_signed);
  uint64_t loadLSQ(uint32_t vaddr, uint32_t context, uint8_t size, bool is_signed);

  void store(uint64_t data, uint32_t vaddr, uint32_t context, uint8_t size);
  void storeLSQ(uint64_t data, uint32_t vaddr, uint32_t context, uint8_t size);

  void restore_reg_file_on_simpanic(vector<uint64_t> x_reg_file);
  void print_debug_info();
  void recalc();

  // Operand accessors: read the physical source registers named by instr.
  uint64_t s_op64(decoder::instr_h instr) {
    return reg_file[instr->phys_s_reg];
  }
  uint64_t t_op64(decoder::instr_h instr) {
    return reg_file[instr->phys_t_reg];
  }
  // Destination of an executed instruction: always the internal result
  // latch, regardless of instr (the argument is unused).
  uint64_t& dest(decoder::instr_h instr) {
    return result;
  }
  uint32_t& program_counter() {
    return the_pc;
  }

  // Typed load wrappers: forward to load() with a size of 1, 2, 4 or 8
  // (matching the bit width in the name) and the signedness in the name.
  uint64_t load_int8(uint32_t vaddr, uint32_t context=0) {
    return load(vaddr, context, 1, true);
  }

  uint64_t load_uint8(uint32_t vaddr, uint32_t context=0) {
    return load(vaddr, context, 1, false);
  }

  uint64_t load_int16(uint32_t vaddr, uint32_t context=0) {
    return load(vaddr, context, 2, true);
  }

  uint64_t load_uint16(uint32_t vaddr, uint32_t context=0) {
    return load(vaddr, context, 2, false);
  }

  uint64_t load_int32(uint32_t vaddr, uint32_t context=0) {
    return load(vaddr, context, 4, true);
  }

  uint64_t load_uint32(uint32_t vaddr, uint32_t context=0) {
    return load(vaddr, context, 4, false);
  }

  uint64_t load_uint64(uint32_t vaddr, uint32_t context=0) {
    return load(vaddr, context, 8, false);
  }

  // Typed store wrappers: forward to store() with a size of 1, 2, 4 or 8.
  void store_uint8(uint8_t data, uint32_t vaddr, uint32_t context=0) {
    store(data, vaddr, context, 1);
  }

  void store_uint16(uint16_t data, uint32_t vaddr, uint32_t context=0) {
    store(data, vaddr, context, 2);
  }

  void store_uint32(uint32_t data, uint32_t vaddr, uint32_t context=0) {
    store(data, vaddr, context, 4);
  }

  void store_uint64(uint64_t data, uint32_t vaddr, uint32_t context=0) {
    store(data, vaddr, context, 8);
  }

  // Host <-> simulated-memory helpers: forward directly to the
  // sparse_memory of the selected context.
  void memcpy_to_host(void* dest, const uint32_t src, size_t n, uint32_t context=0) {
    the_mem[context].memcpy_to_host(dest, src, n);
  }
  void memcpy_from_host(uint32_t dest, const void* src, size_t n, uint32_t context=0) {
    the_mem[context].memcpy_from_host(dest, src, n);
  }
  void strcpy_to_host(char* dest, const uint32_t src, uint32_t context=0) {
    the_mem[context].strcpy_to_host(dest, src);
  }
  void strcpy_from_host(uint32_t dest, const char* src, uint32_t context=0) {
    the_mem[context].strcpy_from_host(dest, src);
  }
  void emulate_syscall(uint32_t cmd_addr, uint32_t context=0) {
    the_mem[context].emulate_syscall(cmd_addr);
  }

  void halt() {
    // noop
  }

  // Forces entry 0 of the register file to zero.
  // NOTE(review): only index 0 is cleared, not a per-context register 0 --
  // confirm this is what multi-context configurations expect.
  void zero_reg_0() {
    reg_file[0] = 0;
  }

  // Takes const char* so string literals can be passed without the
  // literal-to-char* conversion that C++11 removed; char* arguments still
  // convert implicitly.
  void print_err_msg(const char *s) {
    // noop (bogus instructions are often sent down pipeline speculatively)
  }
};

#endif /* EXEC_UNIT_H_GUARD */
