#include <stdio.h>
#include <inttypes.h>
#include <vector>
#include <unistd.h>
#include <fcntl.h>
#include "circuit.h"
#include "sparse_memory.h"
#include "decode.h"
#include "ifetch.h"
#include "decode_circuit.h"
#include "rename_stage.h"
#include "rob.h"
#include "scoreboard.h"
#include "exec_unit.h"

using namespace std;

// Debugging global variables decoded from command line.
// NOTE(review): these are non-static globals; presumably other simulator
// source files reference them via extern -- confirm before renaming any.
uint32_t debug_mask = 0;          // debug bit mask currently in effect (bit meanings are listed in usage())
uint32_t delay_debug_mask = 0;    // mask requested with -debug; main() holds it here until the debug_delay cycle
bool delay_trace_on = 0;          // -trace request; main() holds it here until the debug_delay cycle
bool trace_on = false;            // set by -trace
bool trace_on_noarch = false;     // set by -trace_noarch
bool arch_compare_on = true;      // already true by default, so passing -arch_compare does not change it
uint32_t tornado_warning = 1000;  // -tornado_warn <n>: per the usage text, warn after n cycles without retirement
uint64_t cycle_count = 0;         // incremented once per iteration of main()'s simulation loop

// Command line defined global variables
bool branch_pred_off = false;           // set by -bp_off
//uint32_t num_phys_regs = 32;            //for hw1
uint32_t num_phys_regs = 64; 		// Number of physical registers for register renaming (-phys_regs)
uint32_t scoreboard_fifo_size = 64; 	// Number of entries in the combined replay buffer/issue window (-sbrd_fifo)
uint32_t rob_size = 64;                 // Number of ROB entries (-rob_size)
bool branch_speculate = false;          // set by -speculate
uint32_t superpipeline_factor = 4;      // set by -superpipeline <n>; main() scales the reported IPC by it

// Cache/Memory parameters. Each group is overridden by the matching
// -cache_lsize / -cache_assoc / -cache_lines / -cache_delay option.
// Units below follow the usage text: line size in bytes, associativity in
// ways, and delay in cycles to access the next level.

// Level-1 instruction cache (selected with I or i on the command line)
uint32_t iL1_lsize = 128;
uint32_t iL1_assoc = 1;
uint32_t iL1_lines = 256;
uint32_t iL1_delay = 10;

// Level-1 data cache (selected with D or d on the command line)
uint32_t dL1_lsize = 64;
uint32_t dL1_assoc = 4;
uint32_t dL1_lines = 32;
uint32_t dL1_delay = 10;

// Level-2 cache (selected with L or l on the command line)
uint32_t L2_lsize = 128;
uint32_t L2_assoc = 8;
uint32_t L2_lines = 512;
uint32_t L2_delay = 100;

// Prints the command-line help text to stdout and terminates the process
// with exit status 0.  This function does not return.
//
// It is called both for an explicit help request and from the error
// reporters (input_error, prog_comm_error) after they print their message.
void
usage(void) 
{
  printf("\n\tECE412 SIMULATOR\n");
  // Synopsis: every option that consumes a value shows its <n>/<t> operand.
  printf("\nUsage:\n\tpipelined-model [-trace] [-trace_noarch] [-info] [-speculate]\n");
  printf("\t\t[-superpipeline <n>] [-bp_off] [-phys_regs <n>] [-sbrd_fifo <n>] [-rob_size <n>]\n");
  printf("\t\t[-cache_lsize <t> <n>] [-cache_lines <t> <n>] [-cache_assoc <t> <n>] [-cache_delay <t> <n>]\n");
  printf("\t\t[-debug <mask>] [-debug_delay <n>] [-arch_compare] [-tornado_warn <n>] [-stop <n>] exec_name args\n");
  printf("\nOptional:  (In any order)\n\t-help,-h\tUsage text\n");
  printf("\t-trace\t\tOutputs the PC for every instruction\n\t\t\texecuted in order to compare with arch-model output\n");
  printf("\t-trace_noarch\tOutputs the PC value of every cycle in the exec unit\n");
  printf("\t-info\t\tDisplays sizes you are using for various components\n");
  printf("\t-speculate\tTurns on speculative issue past unresolved branches\n");
  printf("\t-superpipeline <n>\tChanges superpipeline factor\n");
  printf("\t-bp_off\t\tTurns off branch predictor so always predict not taken\n");
  printf("\t-phys_regs <n>\tSets number of physical regs for renaming (default = 64)\n");
  printf("\t-sbrd_fifo <n>\tSets number of scoreboard/replay fifo entries (default = 64)\n");
  printf("\t-rob_size <n>\tSets number of ROB entries (default = 64)\n");
  printf("\t-cache_lsize <t> <n>\tSets line size of cache (I1|D1|L2) to n Bytes \n\t\t\t\t(default: I1=128, D1=64, L2=128)\n");
  printf("\t-cache_lines <t> <n>\tSets number of lines of cache (I1|D1|L2) to n lines\n\t\t\t\t(default: I1=256, D1=32, L2=512)\n");
  printf("\t-cache_assoc <t> <n>\tSets associativity of cache (I1|D1|L2) to n ways \n\t\t\t\t(default: I1=1, D1=4, L2=8)\n");
  printf("\t-cache_delay <t> <n>\tSets delay to access next level for (I1|D1|L2) to n cycles \n\t\t\t\t(default: I1=10, D1=10, L2=100)\n");
  printf("\t-debug <mask>\tUse bit mask, mask[31:0], to turn on debugging output:\n");
  printf("\t\t\tA default setting of all zeroes gives no output\n");
  printf("\t\t0.\tDebug info for instruction fetch stage\n");
  printf("\t\t1.\tDebug info for decoder stage\n");
  printf("\t\t2.\tDebug info for register renaming state\n");
  printf("\t\t3.\tDebug info for scoreboard stage\n");
  printf("\t\t4.\tDebug info for the rob stage\n");
  printf("\t\t5.\tDebug info for register file stage\n");
  printf("\t\t6.\tDebug info for execution stage\n");
  printf("\t\t7.\tShow current and retirement RATs\n");
  printf("\t\t8.\tShow register renamer free list\n");
  printf("\t\t9.\tShow scoreboard fifo destination regs\n");
  printf("\t\t10.\tShow ROB destination regs\n");
  printf("\t\t11.\tShow store addresses and data\n");
  printf("\t\t12.\tShow load addresses and data\n");
  printf("\t\t13.\tPrint architectural dest reg and data for each retiring instruction\n");
  printf("\t\t14.\tDebug info for caches\n");
  printf("\t-debug_delay <n>\tWait until this cycle n before displaying debug info\n");
  printf("\t-arch_compare\tCompare retiring instructions with trace-driven results.\n");
  printf("\t-tornado_warn <n>\tWarn if n cycles are executed without instruction retirement.\n");
  printf("\t-stop <n>\tCycles to run before stopping simulation\n");
  printf("\nRequired:  (Must go last)\n");
  printf("\texec_name\tFile name of executable to be run on simulator\n");
  printf("\targs\t\tThe arguments to be passed to the simulated program\n");
  printf("\nExamples:\n\tExample of using debug mask with full debug info:\n");
  printf("\t\tDebug mask = 111111111111111 in binary or 0x7fff in hex\n");
  printf("\t\tpipelined-model -debug 0x7fff mipsver 5\n");
  printf("\tExample of just turning on debug info for instruction fetch:\n");
  printf("\t\tpipelined-model -debug 1 mipsver 5\n");
  printf("\tA particularly useful mode is:\n");
  printf("\t\tpipelined-model -trace -debug 0x2000 mipsver 5\n");
  printf("\tthis should produce identical output to:\n");
  printf("\t\tarch-model -trace -debug 0x2000 mipsver 5\n");

  printf("\n");
  exit(0);
}

// Reports an invalid value given for a command-line option, then prints the
// usage text and terminates.
//   error_input  - the offending text as it appeared on the command line
//   error_option - name of the option it was supplied for (printed in quotes)
// Both parameters are only read, so they are const: callers pass string
// literals, which ISO C++ does not allow to convert to plain char*.
void input_error(const char* error_input, const char* error_option) {
  printf("\nERROR: the value %s is invalid input for the option \"%s\"\n",
         error_input, error_option);
  usage();

  // usage() terminates the process itself, so this line is a safeguard that
  // only matters if usage() is ever changed to return; an error path should
  // then still end with a failure status.
  exit(1);
}

// Reports that a simulator option was found after the simulated program's
// name (simulator options must precede it), then prints the usage text and
// terminates.
//   error_option - name of the misplaced option (printed in quotes)
// The parameter is only read, so it is const: callers pass string literals,
// which ISO C++ does not allow to convert to plain char*.
void prog_comm_error(const char* error_option) {
  printf("\nERROR: You have entered the option \"%s\" after already entering the\n",
         error_option);
  printf("\tsimulation program name\n");
  usage();

  // usage() terminates the process itself, so this line is a safeguard that
  // only matters if usage() is ever changed to return; an error path should
  // then still end with a failure status.
  exit(1);
}

/*** MEMORY INITIALIZATION ROUTINE ***/
// Copies the simulated program's argument strings into the_mem and builds
// the matching pointer table.
//   the_mem       - memory image that receives the strings and the table
//   sim_prog_argv - host-side argument strings
//   sim_prog_argc - number of entries in sim_prog_argv
//   string_pos    - position at which the first string is written
//   argv_pos      - position of the first 4-byte table entry
// Strings are packed back to back, each followed by its terminating NUL.
// NOTE(review): store_uint32 is presumably (value, address) -- confirm
// against sparse_memory.h.
void init_mem(sparse_memory& the_mem, char* sim_prog_argv[], int sim_prog_argc, uint32_t string_pos, uint32_t argv_pos) {
  uint32_t write_cursor = string_pos;   // where the next string will land
  uint32_t table_slot = argv_pos;       // table entry for the current string

  for (int arg_index = 0; arg_index < sim_prog_argc; ++arg_index, table_slot += 4) {
    char* current_arg = sim_prog_argv[arg_index];

    the_mem.strcpy_from_host(write_cursor, current_arg);
    the_mem.store_uint32(write_cursor, table_slot);

    // step over the string just written, including its NUL
    write_cursor += (strlen(current_arg) + 1);
  }

  // the table ends with a zero entry
  the_mem.store_uint32(0, argv_pos + (4 * sim_prog_argc));
}

int
main(int argc, char* argv[])
{
  // **** Declarations for command line decoding
  char** sim_prog_argv = NULL;		// Simulated prog's argv
  int sim_prog_argc = 0;					// Simulated prog's argc
  bool read_sim_command = false;	// indicates beginging of prog info
  bool stop_exec = false;
  int stop_after = 0;
  int n = 0;
  bool info_on = false;
  uint64_t debug_delay = 0;

  // **** PRINTS USAGE IF NO ARGS ARE FOUND WITH PROGRAM
  if(argc == 1)
    usage();

  // **** COMMAND LINE DECODE
  for(int i = 1; i < argc; i++) {
    // Help and usage output
    if(!strcmp(argv[i], "-help") || !strcmp(argv[i], "-h"))
      prog_comm_error("help");
    // warn if several cycles without rob commit
    else if(!strcmp(argv[i], "-tornado_warn")) {
      if (read_sim_command) 
        prog_comm_error("tornado_warn");
      i++;
      tornado_warning = atoi(argv[i]);
      if(tornado_warning == 0)
        input_error(argv[i], "tornado_warn");
    }
    // compare instruction results at retirement with arch model
    else if(!strcmp(argv[i],"-arch_compare")) {	
      if (read_sim_command) 
        prog_comm_error("arch_compare");
      arch_compare_on = true;
    }
    // Trace mode prints out the PC to compare with arch-model program flow
    else if(!strcmp(argv[i],"-trace")) {	
      if (read_sim_command) 
        prog_comm_error("trace");
      trace_on = true;
    }
    else if(!strcmp(argv[i],"-trace_noarch")) {	
      if (read_sim_command) 
        prog_comm_error("trace_noarch");
      trace_on_noarch = true;
    }
    // Provides debug information as set by number based on which information
    // you would like to see
    else if(!strcmp(argv[i], "-debug")) {
      if (read_sim_command) 
        prog_comm_error("debug");
      i++;
      debug_mask = strtoul(argv[i], 0, 0); // allow octal, decimal or hex input
      if(debug_mask == 0)
        input_error(argv[i], "debug"); 
    }
    // Allows you to stop simulator after a certain number of statereg cycles
    else if(!strcmp(argv[i], "-stop")) {
      if (read_sim_command) 
        prog_comm_error("stop");
      i++;
      stop_after = atoi(argv[i]);
      if(stop_after == 0)
        input_error(argv[i], "stop");
      stop_exec = true;
    }
    // Allows to delay debug output until the given cycle
    else if(!strcmp(argv[i], "-debug_delay")) {
      if (read_sim_command) 
        prog_comm_error("debug_delay");
      i++;
      debug_delay = strtoull(argv[i], NULL, 0);
      if(debug_delay == 0)
        input_error(argv[i], "debug_delay");
    }    
    // Sets cache line size
    else if(!strcmp(argv[i], "-cache_lsize")) {
      if (read_sim_command) 
        prog_comm_error("cache_lsize");
      i++;
      switch(*argv[i]) {
      case 'D': case 'd':
	dL1_lsize = atoi(argv[i+1]);
	break;
      case 'I': case 'i':
	iL1_lsize = atoi(argv[i+1]);
	break;
      case 'L': case 'l':
	L2_lsize = atoi(argv[i+1]);
	break;
      default:
	input_error(argv[i], "cache_lsize");
	break;
      }
      i++;
      if(atoi(argv[i]) <= 0)
        input_error(argv[i], "cache_lsize");
    }
    // Sets cache associativity
    else if(!strcmp(argv[i], "-cache_assoc")) {
      if (read_sim_command) 
        prog_comm_error("cache_assoc");
      i++;
      switch(*argv[i]) {
      case 'D': case 'd':
	dL1_assoc = atoi(argv[i+1]);
	break;
      case 'I': case 'i':
	iL1_assoc = atoi(argv[i+1]);
	break;
      case 'L': case 'l':
	L2_assoc = atoi(argv[i+1]);
	break;
      default:
	input_error(argv[i], "cache_assoc");
	break;
      }
      i++;
      if(atoi(argv[i]) <= 0)
        input_error(argv[i], "cache_assoc");
    }
    // Sets cache lines
    else if(!strcmp(argv[i], "-cache_lines")) {
      if (read_sim_command) 
        prog_comm_error("cache_lines");
      i++;
      switch(*argv[i]) {
      case 'D': case 'd':
	dL1_lines = atoi(argv[i+1]);
	break;
      case 'I': case 'i':
	iL1_lines = atoi(argv[i+1]);
	break;
      case 'L': case 'l':
	L2_lines = atoi(argv[i+1]);
	break;
      default:
	input_error(argv[i], "cache_lines");
	break;
      }
      i++;
      if(atoi(argv[i]) <= 0)
        input_error(argv[i], "cache_lines");
    }
    // Sets cache delay
    else if(!strcmp(argv[i], "-cache_delay")) {
      if (read_sim_command) 
        prog_comm_error("cache_delay");
      i++;
      switch(*argv[i]) {
      case 'D': case 'd':
	dL1_delay = atoi(argv[i+1]);
	break;
      case 'I': case 'i':
	iL1_delay = atoi(argv[i+1]);
	break;
      case 'L': case 'l':
	L2_delay = atoi(argv[i+1]);
	break;
      default:
	input_error(argv[i], "cache_delay");
	break;
      }
      i++;
      if(atoi(argv[i]) <= 0)
        input_error(argv[i], "cache_delay");
    }
    // This is the number of bytes per cache line in powers of 2
    else if(!strcmp(argv[i], "-phys_regs")) {
      if (read_sim_command)
        prog_comm_error("phys_regs");
      i++;
      num_phys_regs = atoi(argv[i]);
    }
    else if(!strcmp(argv[i], "-sbrd_fifo")) {
      if (read_sim_command)
        prog_comm_error("sbrd_fifo");
      i++;
      scoreboard_fifo_size = atoi(argv[i]);
    }
    else if(!strcmp(argv[i], "-rob_size")) {
      if (read_sim_command)
        prog_comm_error("rob_size");
      i++;
      rob_size = atoi(argv[i]);
    }
    // Allows you to display the cache and latency settings
    else if(!strcmp(argv[i], "-info")) {
      if (read_sim_command) 
        prog_comm_error("info");
      info_on = true;
    }
    // Allows you to turn off branch prediction
    else if(!strcmp(argv[i], "-bp_off")) {
      if (read_sim_command) 
        prog_comm_error("bp_off");
      branch_pred_off = true;
    }
    // Allows you to turn off out-of-order issue
    else if(!strcmp(argv[i], "-speculate")) {
      if (read_sim_command) 
        prog_comm_error("speculate");
      branch_speculate = true;
    }
    // turn on 2x superpipelining
    else if(!strcmp(argv[i], "-superpipeline")) {
      if (read_sim_command) 
        prog_comm_error("superpipeline");
      i++;
      superpipeline_factor = atoi(argv[i]);
    }
    // Extracts argv and argc for simulator program
    else {
      if(!read_sim_command) {
        read_sim_command = true;
        sim_prog_argc = argc - i;
        sim_prog_argv = new char* [sim_prog_argc];
      }
      sim_prog_argv[n] = argv[i];
      n++;
    }
  }

  // **** MEMORY STRUCTURE DECLARATIONS ****
  sparse_memory the_mem;
  arch_model the_arch_model(&the_mem);


  execution_unit the_exec_unit(&the_mem);
  rob the_rob(&the_arch_model);
  scoreboard the_scoreboard;
  rename_stage the_renamer;
  decoder_circuit the_decoder;
  ifetch the_ifetch(&the_mem);

  // CONNECT UP PIPELINE
  // Use attach statements to attach inports to stateregs which can be
  // considered outputs when connecting pipeline stages
  // **** CONNECT INSTRUCTON FETCH INPORTS ****
  the_ifetch.branch_info.attach(&the_rob.branch_out);
  the_ifetch.scoreboard_stall.attach(&the_scoreboard.scoreboard_stall);
  the_ifetch.rename_stall.attach(&the_renamer.rename_stall);
  the_ifetch.rob_stall.attach(&the_rob.rob_stall);
  // **** CONNECT INSTRUCTION DECODER INPORTS ****
  the_decoder.instr_in.attach(&the_ifetch.notdecoded_instr);
  the_decoder.program_counter_in.attach(&the_ifetch.prev_program_counter);
  the_decoder.predicted_pc_in.attach(&the_ifetch.program_counter);
  the_decoder.local_prediction_in.attach(&the_ifetch.local_prediction);
  the_decoder.global_prediction_in.attach(&the_ifetch.global_prediction);
  the_decoder.global_history_in.attach(&the_ifetch.global_history);
  the_decoder.ra_stack_ptr_in.attach(&the_ifetch.ra_stack_ptr);
  the_decoder.ra_stack_head_in.attach(&the_ifetch.ra_stack_head);
  the_decoder.branch_mispredict.attach(&the_rob.branch_out);
  the_decoder.scoreboard_stall.attach(&the_scoreboard.scoreboard_stall);
  the_decoder.rename_stall.attach(&the_renamer.rename_stall);
  the_decoder.rob_stall.attach(&the_rob.rob_stall);
  // **** CONNECT RENAMER INPORTS ****
  the_renamer.instr_in.attach(&the_decoder.instr_out);
  the_renamer.branch_mispredict.attach(&the_rob.branch_out);
  the_renamer.scoreboard_stall.attach(&the_scoreboard.scoreboard_stall);
  the_renamer.retiring_instr.attach(&the_rob.retiring_instr);
  the_renamer.rob_stall.attach(&the_rob.rob_stall);
  // **** CONNECT SCOREBOARD INPORTS ****
  the_scoreboard.instr_in.attach(&the_renamer.instr_out);
  the_scoreboard.rob_stall.attach(&the_rob.rob_stall);
  the_scoreboard.rob_tail.attach(&the_rob.rob_tail);
  the_scoreboard.rob_head.attach(&the_rob.rob_head);
  the_scoreboard.branch_mispredict.attach(&the_rob.branch_out);
  the_scoreboard.writeback_bus.attach(&the_exec_unit.writeback_bus);
  the_scoreboard.branch_bus.attach(&the_exec_unit.branch_output);
  the_scoreboard.store_bus.attach(&the_exec_unit.store_output);
  // **** CONNECT REORDER BUFFER INPORTS ****
  the_rob.instr_in.attach(&the_renamer.instr_out);
  the_rob.branch_bus.attach(&the_exec_unit.branch_output);
  the_rob.writeback_bus.attach(&the_exec_unit.writeback_bus);
  the_rob.store_bus.attach(&the_exec_unit.store_output);
  // **** CONNECT EXECUTION STAGE INPORTS ****
  the_exec_unit.instr_in.attach(&the_scoreboard.instr_out);
  the_exec_unit.branch_mispredict.attach(&the_rob.branch_out);
  the_exec_unit.commit_head_store.attach(&the_rob.commit_head_store);


  datapath_reg_init dri;
  
  int retval = the_arch_model.load_elf_file(sim_prog_argv[0], sim_prog_argc, sim_prog_argv, &dri);
  if (retval == 0) {
    the_ifetch.program_counter.reset(dri.pc);

    // SET REGISTERS WITH POINTERS TO MAIN'S ARGV AND ARGC
    the_exec_unit.reg_init(&dri);
    the_arch_model.init_regs(&dri);
  }
  else {
    fprintf(stderr, "error loading program image %s\n", sim_prog_argv[0]);
    exit(1);
  }

  // **** THIS CODE RUNS PROGRAM - PROGRAM RUNS UNTIL HALT SIGNAL IS RECEIVED
  cycle_count = 0;
  delay_debug_mask = debug_mask;
  debug_mask = false;
  delay_trace_on = trace_on;
  trace_on = false;
		
  while (!the_arch_model.halted && !(stop_exec && (stop_after-- <= 0))) {
    if(debug_delay == cycle_count) {
      debug_mask = delay_debug_mask;
      trace_on = delay_trace_on;
    }

    if ((debug_mask > 0) && !trace_on)
      printf("--------------- cycle= %llu\t-----------------------------------------\n", cycle_count);

    clocked_net::pulse();	// Toggles statereg clocks to change values like
    // a register in real circuits.
    circuit::level();
    cycle_count++;
  }

  extern uint64_t the_instr_count;

  double ipc = (double) the_instr_count / (double) cycle_count;
  printf("\nNumber of cycles run: %llu\nNumber of instructions: %llu\n",
         cycle_count, the_instr_count);
  printf("utilization: %f\n\n", ipc);

  if (superpipeline_factor > 1) {
    printf("%dx clock gives equivalent speed to single-pump IPC of %.3f\n\n",
           superpipeline_factor, (double)superpipeline_factor * ipc);
  }

  printf("cache hits: %llu, cache misses: %llu\n\n", the_exec_unit.hits, the_exec_unit.misses);

  extern uint64_t the_retired_branch_count;
  extern uint64_t the_mispredicted_branch_count;

  printf("branches: %llu, mispredicted %llu\n\tmispredictiction rate: %.1f%%\n",
         the_retired_branch_count, the_mispredicted_branch_count,
         100.0 * (double)the_mispredicted_branch_count / (double)the_retired_branch_count);

  extern uint64_t return_count;
  extern uint64_t mispredicted_return_count;

  printf("returns: %llu, mispredicted %llu\n\tmisprediction rate: %.1f%%\n\taccounts for %.1f%% of mispredictions\n\tand %.1f%% of branches\n",
         return_count, mispredicted_return_count,
         100.0 * (double)mispredicted_return_count / (double)return_count,
         100.0 * (double)mispredicted_return_count / (double)the_mispredicted_branch_count,
         100.0 * (double)return_count / (double)the_retired_branch_count);

}
