#include "lsq_pf.h"

// LDQ
// Build a load queue: the underlying queue (the_ldq) and ldq_size are both
//  initialized from `size`, and the global load count starts at zero.
// stqptr is kept in the_stq; conflict_check() calls exposed() through it.
ldq::ldq(uint32_t size, stq* stqptr) :
  the_ldq(size),
  ldq_size(size),
  ld_cnt(0),
  the_stq(stqptr)
{ }

// print the entire ldq
void ldq::print() {
  printf("LDQ: Cur Count %u Free Slots %u\n", ld_cnt, ldq_size - ld_cnt);
  // iterate through the ldq on a per-context basis, printing each ldq packet
  for (uint32_t c = 0; c < num_contexts; c++) {
    printf("Context %u Cur Count %u Free Slots %u\n", c, the_ldq.num_entries(c), the_ldq.num_free_slots_context(c));
    for (uint32_t slot = the_ldq.head(c); slot != the_ldq.invalid(); slot = the_ldq.get_next(slot)) { 
      ldq_packet* ldptr = the_ldq.get_element(slot);
      printf("slot %u ", slot);
      ldptr->print();
    }
  }
}

// print the specified context of the ldq
void ldq::printCtxt(uint32_t ctxt) {
  uint32_t c = ctxt;
  printf("LDQ: Context %u Cur Count %u Free Slots %u\n", c, the_ldq.num_entries(c), the_ldq.num_free_slots_context(c));
  for (uint32_t slot = the_ldq.head(c); slot != the_ldq.invalid(); slot = the_ldq.get_next(slot)) { 
    ldq_packet* ldptr = the_ldq.get_element(slot);
    printf("slot %u ", slot);
    ldptr->print();
  }
}

// Print every load recorded in the arb bucket that addr maps to
//  (addr & ARB_ADDR_MASK). Loads are recorded there by insert().
// Prints only the header line when no bucket exists for the address.
void ldq::printAddr(uint32_t addr) {
  printf("LDQ: Addr %08x\n", addr);

  arb::iterator arb_it = the_arb.find(addr & ARB_ADDR_MASK);
  if (arb_it == the_arb.end())
    return;

  // The iterator is initialized directly from begin(); building a map
  //  iterator from NULL is not portable.
  for (arb_entry::iterator addr_it = arb_it->second.begin();
       addr_it != arb_it->second.end(); ++addr_it) {
    // Slots are stored as size_t but the queue is indexed with uint32_t;
    //  print the same 32-bit value that is used for the lookup so that the
    //  argument matches the %u conversion.
    const uint32_t slot = (uint32_t)addr_it->second;
    ldq_packet* ldptr = the_ldq.get_element(slot);
    printf("slot %u ", slot);
    ldptr->print();
  }
}

// Print the whole load arb: for every bucket, its key followed by each load
//  recorded in that bucket.
void ldq::printARB() {
  printf("LDQ: ARB\n");

  for (arb::iterator arb_it = the_arb.begin(); arb_it != the_arb.end(); ++arb_it) {
    printf("ADDR %08x:\n", arb_it->first);

    // The iterator is initialized directly from begin(); building a map
    //  iterator from NULL is not portable.
    for (arb_entry::iterator addr_it = arb_it->second.begin();
         addr_it != arb_it->second.end(); ++addr_it) {
      // Print the same 32-bit slot value that is used for the lookup so
      //  that the argument matches the %u conversion.
      const uint32_t slot = (uint32_t)addr_it->second;
      ldq_packet* ldptr = the_ldq.get_element(slot);
      printf("slot %u ", slot);
      ldptr->print();
    }
  }
}

// stall_loads: stall front end (from dispatch to fetch) when
//  number of free slots in ldq <= 1.
// Should not be used by SMT.
// Should be used by superscalar.
bool ldq::stall_loads() const {
  // free slots = total capacity minus loads currently allocated
  if ((ldq_size - ld_cnt) <= 1)
    return true;
  return false;
}

// smt_stall: for the specified context, stall fetch when number of free
//  slots in ldq (for that context) <= 8 + frontend_delay_size.
// Should be used by SMT.
// Should not be used by superscalar (unnecessarily conservative).
bool ldq::smt_stall(uint32_t ctxt) {
  // No stall while the context still has more free slots than the margin.
  if (the_ldq.num_free_slots_context(ctxt) > (8 + frontend_delay_size))
    return false;
  return true;
}

// Returns true when no slot is left in the ldq as a whole
//  (all contexts together).
bool ldq::full() const {
  // The allocated count must never exceed the capacity. Checking this on
  //  the operands is required: the difference is held in an unsigned
  //  variable, so a ">= 0" test on it could never fail.
  assert(ld_cnt <= ldq_size);
  const uint32_t num_avail = ldq_size - ld_cnt;
  return (num_avail == 0);
}

// Returns true when no load is allocated in any context.
bool ldq::empty() const {
  if (ld_cnt != 0)
    return false;
  return true;
}

// Returns true when the specified context has no free slot left in the ldq.
bool ldq::full(uint32_t ctxt) {
  // NOTE(review): if num_free_slots_context() returns an unsigned type this
  //  assertion can never fail -- confirm against the queue's declaration.
  assert(the_ldq.num_free_slots_context(ctxt) >= 0);
  if (the_ldq.num_free_slots_context(ctxt) != 0)
    return false;
  return true;
}

// Returns true when the specified context holds no loads in the ldq.
bool ldq::empty(uint32_t ctxt) {
  if (the_ldq.num_entries(ctxt) != 0)
    return false;
  return true;
}

// allocate: allocate a slot in the ldq for the dispatched load
size_t ldq::allocate(uint32_t context, uint64_t ts) {
  assert(!full(context));
  ldq_packet pkt;
  pkt.inum = ts;
  pkt.context = context;

  // enqueue the packet in the hwll for its context
  uint32_t slot = the_ldq.push_tail(context, pkt);
  assert(slot != the_ldq.invalid());

  ld_cnt++;
  return slot;
}

void ldq::insert(size_t slot, uint32_t address, uint32_t size, uint32_t pc, size_t rob_slot, uint32_t ratchk, uint32_t prev_pc) {
  ldq_packet* ldptr = the_ldq.get_element((uint32_t)slot);  
  ldptr->size = size;
  ldptr->instr_pc = pc;
  ldptr->rob_slot = rob_slot;
  ldptr->rat_checkpt = ratchk;
  ldptr->prev_pc = prev_pc;
  ldptr->completed = true;
  ldptr->address = address;

  arb::iterator arb_it = the_arb.find(address & ARB_ADDR_MASK);
  if (arb_it == the_arb.end()) {
    // no in-flight, completed load to specified address
    arb_entry ae;
	pair<arb_entry::iterator, bool> ae_ins_res;
    ae_ins_res = ae.insert(arb_entry::value_type(ldptr->inum, slot));
	assert(ae_ins_res.second);

	pair<arb::iterator, bool> arb_ins_res;
    arb_ins_res = the_arb.insert(arb::value_type(address & ARB_ADDR_MASK, ae));
	assert(arb_ins_res.second);
  }
  else {
    // there is an in-flight, completed load to specified address
	arb_it->second.insert(arb_entry::value_type(ldptr->inum, slot));
  }

  return;
}

void ldq::commit_head(uint32_t context, uint64_t ts) {
  // verify that there is a load to be removed in this context
  assert(the_ldq.num_entries(context) > 0);

  // remove the load from the head of the ldq
  ldq_packet* ldptr = NULL;
  the_ldq.pop_head(context, &ldptr);

  // verify that the expected load was removed
  assert(ldptr);
  assert(ldptr->inum == ts);

  // remove the load from the arb
  arb::iterator arb_it = the_arb.find(ldptr->address & ARB_ADDR_MASK);
  if (arb_it != the_arb.end()) {
    arb_entry::iterator addr_it = arb_it->second.find(ldptr->inum);
    if (addr_it != arb_it->second.end())
      arb_it->second.erase(addr_it);
  }

  // clear the load's slot in the ldq
  *ldptr = ldq_packet();
  ld_cnt--;
}

// flush all canceled loads between the head of the ldq and the first not-canceled load
void ldq::flush_head_canceled(uint32_t ctxt) {
  assert(!empty(ctxt));

  uint32_t slot = the_ldq.head(ctxt);
  assert(slot != the_ldq.invalid());
  ldq_packet* ldptr = the_ldq.get_element(slot);
  while ((ldptr != NULL) && (ldptr->canceled)) {
    // remove the load from the head of the ldq
    the_ldq.pop_head(ctxt, &ldptr);

    arb::iterator arb_it = the_arb.find(ldptr->address & ARB_ADDR_MASK);
    if (arb_it != the_arb.end()) {
      arb_entry::iterator addr_it = arb_it->second.find(ldptr->inum);
      if (ldptr->completed) {
	assert(addr_it != arb_it->second.end());
	arb_it->second.erase(addr_it);
      }
      else {
	assert(addr_it == arb_it->second.end());
      }
    }

    // clear the load's slot in the ldq
    *ldptr = ldq_packet();
    ld_cnt--;

    // get the new head
    slot = the_ldq.head(ctxt);
    if (slot != the_ldq.invalid())
      ldptr = the_ldq.get_element(slot);
    else
      ldptr = NULL;
  }
}

bool ldq::load_valid(size_t slot, uint64_t ts) {
  ldq_packet* ldptr = NULL;
  ldptr = the_ldq.get_element((uint32_t)slot);
  return ((ldptr != NULL) && !ldptr->canceled && (ldptr->inum == ts));
}

// Mark as canceled every load of the specified context whose timestamp is
//  >= ts. The context is walked from its tail toward its head and the walk
//  stops at the first load with a timestamp below ts.
// Canceled loads stay in the queue; flush_head_canceled() removes them.
void ldq::clobber(uint32_t context, uint64_t ts) {
  if (debug_mask & 0x0800) {
    // context is unsigned (%u); ts is cast so the argument matches %llu on
    //  platforms where uint64_t is not unsigned long long.
    printf("LDQ> CLOBBER: ctext %u ts %llu\n", context, (unsigned long long)ts);
  }

  for (uint32_t slot = the_ldq.tail(context);
       slot != the_ldq.invalid();
       slot = the_ldq.get_prev(slot)) {
    ldq_packet* ldptr = the_ldq.get_element(slot);
    if (ldptr->inum < ts)
      break;  // reached a load older than ts
    ldptr->canceled = true;
  }
}

// Determines whether intermediate stores (between an earlier store and a later load)
//  have killed any bytes of the earlier store's value by overwriting those bytes.
// Returns a bitmask marking the bytes that are still live (have not been
//  overwritten by an intermediate store) as of ld_ts.
// st_ts: the lower bound on timestamps of intermediate stores that should be examined
// st_addr: addr of aligned dword accessed by the store/load pair
// exposed_bits: bitmask marking the bytes that are still live (have not been
//  overwritten by an intermediate store) as of st_ts-1.
// ld_ts: the upper bound on timestamps of intermediate stores that should be examined
uint8_t stq::exposed(uint32_t st_addr, uint64_t& st_ts, uint8_t exposed_bits, uint64_t ld_ts)
{
  arb::iterator arb_it = the_arb.find(st_addr & ARB_ADDR_MASK);
  assert(arb_it != the_arb.end());
  arb_entry::iterator addr_it = arb_it->second.lower_bound(st_ts);

  uint8_t ret_bits = exposed_bits;
  while (ret_bits && (addr_it != arb_it->second.end()) && (addr_it->first < ld_ts)) {
    size_t slot = addr_it->second;
    stq_packet* stptr = the_stq.get_element((uint32_t)slot);
    st_ts = stptr->inum + 1; 

    // Allow the intermediate store to kill byte(s) of the earlier store
    //  if the following conditions are met:
    // 1. The intermediate store is completed and not canceled.
    // 2. The intermediate store is NOT a badpath store
    if (stptr->completed && !stptr->canceled)
      ret_bits = ret_bits & ~get_access_bitmask(stptr->address, stptr->size);

    addr_it++;
  }

  return ret_bits;
}

// If the store conflicts with a completed load, returns true and returns
//  the earliest such load's info in conf.
// If the store does not conflict with a completed load, returns false and
//  leaves conf untouched.
// st_addr / st_size: address and size of the store.
// st_ctext: context of the store (used only in the debug trace).
// st_ts: timestamp of the store; only loads whose arb key is >= st_ts are examined.
bool ldq::conflict_check(uint32_t st_addr, uint32_t st_size, uint32_t st_ctext, uint64_t st_ts, ldq_packet& conf) {

  if (debug_mask & 0x0800) {
    // st_ts is cast so the argument matches %llu on platforms where
    //  uint64_t is not unsigned long long.
    printf("TRUE MEM DEP CHECK: store addr %08x size %u ctext %u ts %llu\n", st_addr, st_size, st_ctext, (unsigned long long)st_ts);
  }

  // bytes of the aligned dword written by the store that are still visible
  uint8_t exposed_bits = get_access_bitmask(st_addr, st_size);

  // If there are no in-flight, completed loads to the store's address, then there
  //  is no dependence violation.
  arb::iterator arb_it = the_arb.find(st_addr & ARB_ADDR_MASK);
  if (arb_it == the_arb.end())
    return false;

  // Examine the loads to the store's dword in timestamp order, starting at
  //  the first one that is not earlier than the store. Before each load is
  //  tested, the stq removes the bytes that intermediate stores have
  //  overwritten; the scan ends once no byte of the store is exposed.
  // NOTE(review): the scan keeps going after the first conflict is recorded;
  //  this no longer affects conf or the return value, only the debug trace.
  bool earliest_ld_found = false;
  uint64_t next_st_ts = st_ts + 1;
  for (arb_entry::iterator addr_it = arb_it->second.lower_bound(st_ts);
       (addr_it != arb_it->second.end()) && exposed_bits;
       ++addr_it) {
    exposed_bits = the_stq->exposed(st_addr, next_st_ts, exposed_bits, addr_it->first);

    ldq_packet* ldptr = the_ldq.get_element((uint32_t)addr_it->second);

    if (debug_mask & 0x0800) {
      printf("   exposed bits: %02x, checking load: ", exposed_bits);
      ldptr->print();
    }

    // A load conflicts with the specified store only under the following conditions:
    // 1. The load is completed and not canceled
    // 2. The load should have received data from the store (intervening stores
    //     to the same address have not killed the value of the specified store).
    //     This test is simplified by the fact that we know (from the arb) that
    //     the specified store and any loads being tested both access some portion
    //     of the same aligned dword.
    if ( (ldptr->completed && !ldptr->canceled) &&
         (exposed_bits & get_access_bitmask(ldptr->address, ldptr->size))) {
      if (debug_mask & 0x0800)
        printf("   conflict detected\n");

      if (!earliest_ld_found) {
        // loads are visited in timestamp order, so the first hit is the earliest
        conf = *ldptr;
        earliest_ld_found = true;
      }
    }
  }

  // misspeculated load?
  return earliest_ld_found;
}

// STQ
// Build a store queue: the underlying queue (the_stq) and stq_size are both
//  initialized from `size`, and the global store count starts at zero.
// The working_* fields are given placeholder values here; initiate_load()
//  overwrites all four of them.
stq::stq(uint32_t size) :
  the_stq(size),
  stq_size(size),
  st_cnt(0),
  working_data(0),
  working_vaddr(0xdead2fec),
  working_inum(~0),
  working_fwd_mask(0x0000)
{ }

// print the entire stq
void stq::print() {
  printf("STQ: Cur Count %u Free Slots %u\n", st_cnt, stq_size - st_cnt);
  // iterate through the stq on a per-context basis, printing each stq packet
  for (uint32_t c = 0; c < num_contexts; c++) {
    printf("Context %u Cur Count %u Free Slots %u\n", c, the_stq.num_entries(c), the_stq.num_free_slots_context(c));
    for (uint32_t slot = the_stq.head(c); slot != the_stq.invalid(); slot = the_stq.get_next(slot)) { 
      stq_packet* stptr = the_stq.get_element(slot);
      printf("slot %u ", slot);
      stptr->print();
    }
  }
}

// print the specified context of the stq
void stq::printCtxt(uint32_t ctxt) {
  uint32_t c = ctxt;
  printf("STQ: Context %u Cur Count %u Free Slots %u\n", c, the_stq.num_entries(c), the_stq.num_free_slots_context(c));
  for (uint32_t slot = the_stq.head(c); slot != the_stq.invalid(); slot = the_stq.get_next(slot)) { 
    stq_packet* stptr = the_stq.get_element(slot);
    printf("slot %u ", slot);
    stptr->print();
  }
}

// Print every store recorded in the arb bucket that addr maps to
//  (addr & ARB_ADDR_MASK). Stores are recorded there by insert().
// Prints only the header line when no bucket exists for the address.
void stq::printAddr(uint32_t addr) {
  printf("STQ: Addr %08x\n", addr);

  arb::iterator arb_it = the_arb.find(addr & ARB_ADDR_MASK);
  if (arb_it == the_arb.end())
    return;

  // The iterator is initialized directly from begin(); building a map
  //  iterator from NULL is not portable.
  for (arb_entry::iterator addr_it = arb_it->second.begin();
       addr_it != arb_it->second.end(); ++addr_it) {
    // Slots are stored as size_t but the queue is indexed with uint32_t;
    //  print the same 32-bit value that is used for the lookup so that the
    //  argument matches the %u conversion.
    const uint32_t slot = (uint32_t)addr_it->second;
    stq_packet* stptr = the_stq.get_element(slot);
    printf("slot %u ", slot);
    stptr->print();
  }
}

// Print the whole store arb: for every bucket, its key followed by each
//  store recorded in that bucket.
void stq::printARB() {
  printf("STQ: ARB\n");

  for (arb::iterator arb_it = the_arb.begin(); arb_it != the_arb.end(); ++arb_it) {
    printf("ADDR %08x:\n", arb_it->first);

    // The iterator is initialized directly from begin(); building a map
    //  iterator from NULL is not portable.
    for (arb_entry::iterator addr_it = arb_it->second.begin();
         addr_it != arb_it->second.end(); ++addr_it) {
      // Print the same 32-bit slot value that is used for the lookup so
      //  that the argument matches the %u conversion.
      const uint32_t slot = (uint32_t)addr_it->second;
      stq_packet* stptr = the_stq.get_element(slot);
      printf("slot %u ", slot);
      stptr->print();
    }
  }
}

// stall_stores: stall front end (from dispatch to fetch) when
//  number of free slots in stq <= 1.
// Should not be used by SMT.
// Should be used by superscalar.
bool stq::stall_stores() const {
  // free slots = total capacity minus stores currently allocated
  if ((stq_size - st_cnt) <= 1)
    return true;
  return false;
}

// smt_stall: for the specified context, stall fetch when number of free
//  slots in stq (for that context) <= 8 + frontend_delay_size.
// Should be used by SMT.
// Should not be used by superscalar (unnecessarily conservative).
bool stq::smt_stall(uint32_t ctxt) {
  // No stall while the context still has more free slots than the margin.
  if (the_stq.num_free_slots_context(ctxt) > (8 + frontend_delay_size))
    return false;
  return true;
}

// Returns true when no slot is left in the stq as a whole
//  (all contexts together).
bool stq::full() const {
  // The allocated count must never exceed the capacity. Checking this on
  //  the operands is required: the difference is held in an unsigned
  //  variable, so a ">= 0" test on it could never fail.
  assert(st_cnt <= stq_size);
  const uint32_t num_avail = stq_size - st_cnt;
  return (num_avail == 0);
}

// Returns true when no store is allocated in any context.
bool stq::empty() const {
  if (st_cnt != 0)
    return false;
  return true;
}

// Returns true when the specified context has no free slot left in the stq.
bool stq::full(uint32_t ctxt) {
  // NOTE(review): if num_free_slots_context() returns an unsigned type this
  //  assertion can never fail -- confirm against the queue's declaration.
  assert(the_stq.num_free_slots_context(ctxt) >= 0);
  if (the_stq.num_free_slots_context(ctxt) != 0)
    return false;
  return true;
}

// Returns true when the specified context holds no stores in the stq.
bool stq::empty(uint32_t ctxt) {
  if (the_stq.num_entries(ctxt) != 0)
    return false;
  return true;
}

// allocate: reserve a slot in the stq for a dispatched store.
// context: context the store belongs to; must not be full.
// ts: timestamp recorded in the packet's inum field.
// Returns the slot that now holds the store's packet.
size_t stq::allocate(uint32_t context, uint64_t ts) {
  assert(!full());
  // The packet is appended to the list of its own context, so that context
  //  must have room as well (same precondition as ldq::allocate).
  assert(!full(context));

  stq_packet pkt;
  pkt.inum = ts;
  pkt.context = context;

  // Enqueue the packet in the hwll for its context. The slot is kept as
  //  uint32_t, the type used for queue slots everywhere else in this file,
  //  so that it is compared with invalid() without first being widened.
  const uint32_t slot = the_stq.push_tail(context, pkt);
  assert(slot != the_stq.invalid());

  st_cnt++;
  return slot;
}

// write the store into the stq as it completes
void stq::insert(size_t slot, uint32_t address, uint64_t data, uint32_t size) {
  stq_packet* stptr = the_stq.get_element((uint32_t)slot);
  stptr->address = address;
  stptr->data = data;
  stptr->size = size;
  stptr->completed = true;

  arb::iterator arb_it = the_arb.find(address & ARB_ADDR_MASK);
  if (arb_it == the_arb.end()) {
    // no in-flight, completed load to specified address
    arb_entry ae;
    pair<arb_entry::iterator, bool> ae_ins_res;
    ae_ins_res = ae.insert(arb_entry::value_type(stptr->inum, slot));
    assert(ae_ins_res.second);

    pair<arb::iterator, bool> arb_ins_res;
    arb_ins_res = the_arb.insert(arb::value_type(address & ARB_ADDR_MASK, ae));
    assert(arb_ins_res.second);
  }
  else {
    // there is an in-flight, completed load to specified address
    arb_it->second.insert(arb_entry::value_type(stptr->inum, slot));
  }

  return;
}

// Retire the store at the head of the specified context. Only the stq's
//  bookkeeping is updated here; per the original note, the arch-model is
//  what actually commits the change to memory.
// ts: timestamp the head store is expected to carry (checked by assertion).
void stq::commit_head(uint32_t context, uint64_t ts) {
  // Verify that there is a store to be removed in this context. The check
  //  is on the context's own entry count (as in ldq::commit_head): a
  //  non-zero global count does not show that this context has a store.
  assert(the_stq.num_entries(context) > 0);

  // remove the store from the head of the stq
  stq_packet* stptr = NULL;
  the_stq.pop_head(context, &stptr);

  // verify that the expected store was removed
  assert(stptr);
  assert(stptr->inum == ts);

  // Remove the store from the arb. Only the record is erased; the bucket
  //  stays in the_arb even if it is now empty.
  arb::iterator arb_it = the_arb.find(stptr->address & ARB_ADDR_MASK);
  if (arb_it != the_arb.end()) {
    arb_entry::iterator addr_it = arb_it->second.find(stptr->inum);
    if (addr_it != arb_it->second.end())
      arb_it->second.erase(addr_it);
  }

  // clear the store's slot in the stq
  *stptr = stq_packet();
  st_cnt--;
}

// flush all canceled stores between the head of the stq and the first not-canceled store
void stq::flush_head_canceled(uint32_t ctxt) {
  assert(!empty(ctxt));

  uint32_t slot = the_stq.head(ctxt);
  assert(slot != the_stq.invalid());
  stq_packet* stptr = the_stq.get_element(slot);
  while ((stptr != NULL) && (stptr->canceled)) {
    // remove the store from the head of the stq
    the_stq.pop_head(ctxt, &stptr);

    arb::iterator arb_it = the_arb.find(stptr->address & ARB_ADDR_MASK);
    if (arb_it != the_arb.end()) {
      arb_entry::iterator addr_it = arb_it->second.find(stptr->inum);
      if (stptr->completed) {
	assert(addr_it != arb_it->second.end());
	arb_it->second.erase(addr_it);
      }
      else {
	assert(addr_it == arb_it->second.end());
      }
    }

    // clear the store's slot in the stq
    *stptr = stq_packet();
    st_cnt--;

    // get the new head
    slot = the_stq.head(ctxt);
    if (slot != the_stq.invalid())
      stptr = the_stq.get_element(slot);
    else
      stptr = NULL;
  }
}

// Mark as canceled every store of the specified context whose timestamp is
//  >= ts. The context is walked from its tail toward its head and the walk
//  stops at the first store with a timestamp below ts.
// Canceled stores stay in the queue; flush_head_canceled() removes them.
void stq::clobber(uint32_t context, uint64_t ts) {
  if (debug_mask & 0x0800) {
    // context is unsigned (%u); ts is cast so the argument matches %llu on
    //  platforms where uint64_t is not unsigned long long.
    printf("STQ> CLOBBER: ctext %u ts %llu\n", context, (unsigned long long)ts);
  }

  for (uint32_t slot = the_stq.tail(context);
       slot != the_stq.invalid();
       slot = the_stq.get_prev(slot)) {
    stq_packet* stptr = the_stq.get_element(slot);
    if (stptr->inum < ts)
      break;  // reached a store older than ts
    stptr->canceled = true;
  }
}

// Prepare the working fields for a load of vaddr issued as instruction
//  instr_num:
//  - working_vaddr: vaddr rounded down to an 8-byte boundary
//  - working_data:  the dword read from the_mem at working_vaddr, passed
//                   through CVT_ENDIAN_DWORD
//  - working_inum:  the load's instruction number
//  - working_fwd_mask: cleared; forward_queue() sets a bit for every byte
//                   it forwards from a store
// forward_queue() then applies in-flight stores to working_data, so that it
//  ends up holding the in-flight value of the dword.
void stq::initiate_load(uint32_t vaddr, sparse_memory* the_mem, uint64_t instr_num) { 
  const uint32_t dword_addr = vaddr & ~0x7;

  working_fwd_mask = 0x00;
  working_inum = instr_num;
  working_vaddr = dword_addr;
  working_data = CVT_ENDIAN_DWORD(the_mem->load_uint64(working_vaddr));
}

// working_data is an abstraction: a copy of the current value of the aligned
//  dword that a load is accessing, with the "current value" obtained from the
//  cache/mem hierarchy.
// get_paddr returns a pointer into working_data which points to the actual
//  bytes that the load is accessing. For example, if working_data contains
//  the aligned dword at addr 0x0080 (bytes 0x0080 to 0x0087), and the load
//  is accessing address 0x0084, then get_paddr returns a pointer to the
//  byte at offset 4 within working_data.
void* stq::get_paddr(uint32_t vaddr) {
  // Do the offset arithmetic on a byte pointer. Round-tripping the address
  //  through a uint32_t would drop the upper half of the pointer on targets
  //  where pointers are wider than 32 bits.
  uint8_t* dword_base = (uint8_t*)(&working_data);
  return (void*)(dword_base + (vaddr & 0x7));
}

// Given an in-flight store to the working address (the aligned dword accessed
//  by the load), overwrite the bytes of working_data that the store covers
//  with the store's data.
// Stores of 1, 2, 4 and 8 bytes are handled; any other size leaves
//  working_data unchanged.
void stq::forward_store(stq_packet& the_store) {
  assert(working_vaddr == (the_store.address & ARB_ADDR_MASK));

  // location inside working_data that corresponds to the store's address
  void* dest = get_paddr(the_store.address);

  if (the_store.size == 1) {
    uint8_t* byte_dest = (uint8_t*)dest;
    *byte_dest = (uint8_t)(the_store.data);
  }
  else if (the_store.size == 2) {
    uint16_t* hword_dest = (uint16_t*)dest;
    *hword_dest = CVT_ENDIAN_HWORD((uint16_t)(the_store.data));
  }
  else if (the_store.size == 4) {
    uint32_t* word_dest = (uint32_t*)dest;
    *word_dest = CVT_ENDIAN_WORD((uint32_t)(the_store.data));
  }
  else if (the_store.size == 8) {
    uint64_t* dword_dest = (uint64_t*)dest;
    *dword_dest = CVT_ENDIAN_DWORD(the_store.data);
  }
}

// Forward the in-flight value of the working address to the load set up by
//  initiate_load(): every store recorded for the working dword that is
//  completed, not canceled and earlier than the load is applied to
//  working_data, and the bytes it covers are added to working_fwd_mask.
// ctxt, addr, size: not used by this function.
void stq::forward_queue(uint32_t ctxt, uint32_t addr, uint8_t size) {
  if (debug_mask & 0x0800) {
    // cast so the argument matches %llu regardless of how the 64-bit
    //  integer type is defined on the platform
    printf("forwarding to load (ts %llu)\n", (unsigned long long)working_inum);
    printf("forward_queue>\n");
  }

  // If there are no in-flight, completed stores to the load's address, then
  //  the value obtained from the cache/mem hierarchy is the correct value.
  arb::iterator arb_it = the_arb.find(working_vaddr & ARB_ADDR_MASK);
  if (arb_it == the_arb.end())
    return;

  // Visit the stores to the load's aligned dword from the lowest timestamp
  //  upward, applying them cumulatively.
  bool checked_all_earlier_stores = false;
  for (arb_entry::iterator addr_it = arb_it->second.lower_bound(0);
       (addr_it != arb_it->second.end()) && !checked_all_earlier_stores;
       ++addr_it) {
    // Slots are stored as size_t but the queue is indexed with uint32_t;
    //  the same 32-bit value is used for the lookup and for the %u trace.
    const uint32_t st_slot = (uint32_t)addr_it->second;
    stq_packet* stptr = the_stq.get_element(st_slot);

    if (debug_mask & 0x0800) {
      printf("forwarding store: slot %u ", st_slot);
      stptr->print();
    }

    // A store modifies the aligned dword accessed by the load only under the
    //  following conditions:
    // 1. The store is completed and not canceled
    // 2. The store is earlier than the load in program order
    if ( (stptr->completed && !stptr->canceled) &&
         (stptr->inum < working_inum)) {
      forward_store(*stptr);
      working_fwd_mask = working_fwd_mask | get_access_bitmask(stptr->address, stptr->size);
    }

    // Stop once a completed, not-canceled store later than the load is seen.
    checked_all_earlier_stores = ((stptr->inum > working_inum) && (!stptr->canceled) && (stptr->completed));
  }
}

// Returns the number of stores the specified context currently holds in the stq.
uint32_t stq::get_num_stores(uint32_t ctxt) {
  const uint32_t store_count = the_stq.num_entries(ctxt);
  return store_count;
}

uint64_t stq::load_int8(uint64_t instr_num, uint32_t vaddr, uint32_t context, uint8_t size, sparse_memory* the_mem, bool& cache_access) {
  initiate_load(vaddr, the_mem, instr_num);
  forward_queue(context, vaddr, size);
  uint8_t* my_paddr = (uint8_t*)get_paddr(vaddr);
  uint8_t data = *my_paddr;
  uint8_t access_mask = get_access_bitmask(vaddr, 1);
  cache_access = ((access_mask & working_fwd_mask) != access_mask);
  return (uint64_t)((int64_t)((int8_t)data));
}

uint64_t stq::load_uint8(uint64_t instr_num, uint32_t vaddr, uint32_t context, uint8_t size, sparse_memory* the_mem, bool& cache_access) {
  initiate_load(vaddr, the_mem, instr_num);
  forward_queue(context, vaddr, size);
  uint8_t* my_paddr = (uint8_t*)get_paddr(vaddr);
  uint8_t data = *my_paddr;
  uint8_t access_mask = get_access_bitmask(vaddr, 1);
  cache_access = ((access_mask & working_fwd_mask) != access_mask);
  return ((uint64_t)data);
}

uint64_t stq::load_int16(uint64_t instr_num, uint32_t vaddr, uint32_t context, uint8_t size, sparse_memory* the_mem, bool& cache_access) {
  initiate_load(vaddr, the_mem, instr_num);
  forward_queue(context, vaddr, size);
  uint16_t* my_paddr = (uint16_t*)get_paddr(vaddr);
  uint16_t memdata = *my_paddr;
  uint16_t data = CVT_ENDIAN_HWORD(memdata);
  uint8_t access_mask = get_access_bitmask(vaddr, 2);
  cache_access = ((access_mask & working_fwd_mask) != access_mask);
  return (uint64_t)((int64_t)((int16_t)data));
}

uint64_t stq::load_uint16(uint64_t instr_num, uint32_t vaddr, uint32_t context, uint8_t size, sparse_memory* the_mem, bool& cache_access) {
  initiate_load(vaddr, the_mem, instr_num);
  forward_queue(context, vaddr, size);
  uint16_t* my_paddr = (uint16_t*)get_paddr(vaddr);
  uint16_t memdata = *my_paddr;
  uint16_t data = CVT_ENDIAN_HWORD(memdata);
  uint8_t access_mask = get_access_bitmask(vaddr, 2);
  cache_access = ((access_mask & working_fwd_mask) != access_mask);
  return ((uint64_t)data);
}

uint64_t stq::load_int32(uint64_t instr_num, uint32_t vaddr, uint32_t context, uint8_t size, sparse_memory* the_mem, bool& cache_access) {
  initiate_load(vaddr, the_mem, instr_num);
  forward_queue(context, vaddr, size);
  uint32_t* my_paddr = (uint32_t*)get_paddr(vaddr);
  uint32_t memdata = *my_paddr;
  uint32_t data = CVT_ENDIAN_WORD(memdata);
  uint8_t access_mask = get_access_bitmask(vaddr, 4);
  cache_access = ((access_mask & working_fwd_mask) != access_mask);
  return (uint64_t)((int64_t)((int32_t)data));
}

uint64_t stq::load_uint32(uint64_t instr_num, uint32_t vaddr, uint32_t context, uint8_t size, sparse_memory* the_mem, bool& cache_access) {
  initiate_load(vaddr, the_mem, instr_num);
  forward_queue(context, vaddr, size);
  uint32_t* my_paddr = (uint32_t*)get_paddr(vaddr);
  uint32_t memdata = *my_paddr;
  uint32_t data = CVT_ENDIAN_WORD(memdata);
  uint8_t access_mask = get_access_bitmask(vaddr, 4);
  cache_access = ((access_mask & working_fwd_mask) != access_mask);
  return ((uint64_t)data);
}

uint64_t stq::load_uint64(uint64_t instr_num, uint32_t vaddr, uint32_t context, uint8_t size, sparse_memory* the_mem, bool& cache_access) {
  initiate_load(vaddr, the_mem, instr_num);
  forward_queue(context, vaddr, size);
  uint64_t* my_paddr = (uint64_t*)get_paddr(vaddr);
  uint64_t memdata = *my_paddr;
  uint64_t data = CVT_ENDIAN_DWORD(memdata);
  uint8_t access_mask = get_access_bitmask(vaddr, 8);
  cache_access = ((access_mask & working_fwd_mask) != access_mask);
  return data;
}
