#ifndef _BITOPS_H
#define _BITOPS_H

#include <inttypes.h>
#include <string.h>

namespace bitops
{
  /* Overlay of a float and a 32-bit word.  The conversion helpers
     below do not use this type; it is kept unchanged so that code
     outside this header which names it continues to compile. */
  typedef union
  __ieee754_convert_union
  {
    float value;
    uint32_t word;
  } __ieee754_float_int_union;
 
  /* Overlay of a double and a 64-bit word.  Kept unchanged for the
     same reason as the float overlay above. */
  typedef union
  __ieee754_double_convert_union
  {
    double value;
    uint64_t dword;
  } __ieee754_double_int_union;
 
  /* Reinterpret the 32 bits of `u` as a float (no numeric conversion).
     The bytes are copied with memcpy because reading a union member
     other than the one last written is undefined behaviour in C++.
     Assumes float is 32 bits wide, as the union overlay also did. */
  inline
  float
  cvt_int2float(uint32_t u)
  {
    float f;
    memcpy(&f, &u, sizeof f);
    return f;
  }

  /* Return the bit pattern of `f`, sign extended to 64 bits: bits 0-31
     hold the pattern and bits 32-63 are copies of bit 31 (the float's
     sign bit). */
  inline
  uint64_t
  cvt_float2int(float f)
  {
    uint32_t word;
    memcpy(&word, &f, sizeof word);

    /* remember to sign extend to 64 bits!  Done with unsigned
       arithmetic only, so no out-of-range signed conversion is
       involved. */
    uint64_t wide = word;
    if (word & 0x80000000u)
      {
        wide |= ~static_cast<uint64_t>(0xffffffffu);
      }
    return wide;
  }

  /* Reinterpret the 64 bits of `u` as a double (no numeric
     conversion).  Assumes double is 64 bits wide. */
  inline
  double
  cvt_int2double(uint64_t u)
  {
    double d;
    memcpy(&d, &u, sizeof d);
    return d;
  }

  /* Return the 64-bit pattern of `d` (no numeric conversion). */
  inline
  uint64_t
  cvt_double2int(double d)
  {
    uint64_t dword;
    memcpy(&dword, &d, sizeof dword);
    return dword;
  }

  /* Return byte number (rt & 3) of `rs`, where byte 0 is the least
     significant byte.  The result is in the range 0..0xff. */
  inline
  uint32_t
  extract_byte(uint32_t rs, uint32_t rt)
  {
    return (rs >> ((rt & 0x3) * 8)) & 0xff;
  }

  /* Return a 16-bit half of `rs`: the low half when bit 1 of `rt` is
     clear, the high half when it is set.  Bit 0 of `rt` is ignored.
     The result is in the range 0..0xffff. */
  inline
  uint32_t
  extract_halfword(uint32_t rs, uint32_t rt)
  {
    return (rs >> ((rt & 0x2) * 8)) & 0xffff;
  }

  /* Count the set bits in `val` (result 0..32) by repeatedly adding
     adjacent fields of 1, 2, 4, 8 and 16 bits, masking both operands
     at every step. */
  inline
  uint32_t
  popcount_orig(uint32_t val)
  {
    val = ((val & 0x55555555) + ((val >> 1) & 0x55555555));
    val = ((val & 0x33333333) + ((val >> 2) & 0x33333333));
    val = ((val & 0x0f0f0f0f) + ((val >> 4) & 0x0f0f0f0f));
    val = ((val & 0x00ff00ff) + ((val >> 8) & 0x00ff00ff));
    val = ((val & 0x0000ffff) + (val >> 16));

    return val;
  }

  /* Count the set bits in `val` (result 0..32).

     Optimization of popcount_orig that makes use of the fact that the
     sum of a pair of 2 bit values produces a 3 bit value (not a 4 bit)
     and the sum of a pair of 3 bit values produces a 4 bit value (not
     an 8 bit), thus we can reduce the amount of masking we do.  This
     compiles to 21 instructions on MIPS: */
  inline
  uint32_t
  popcount(uint32_t val)
  {
    /* add together adjacent bits to produce 16 values in set: {00b,
     * 01b, 10b}  */
    val = ((val & 0x55555555) + ((val >> 1) & 0x55555555));
    /* add together adjacent 2-bit values to produce 8 values in
     * set: {0000b, 0001b, 0010b, 0011b, 0100b} */
    val = ((val & 0x33333333) + ((val >> 2) & 0x33333333));
    /* now add each of the 4 3-bit fields in the top half to the
     * corresponding 3-bit field in the bottom half to produce 4 values
     * in the set: {0000b, 0001b, 0010b, 0011b, 0100b, 0101b, 0110b,
     * 0111b, 1000b}.  The top half of val is left holding stale data
     * that the masks below discard. */
    /* note that from here on we can use smaller masks (max 12 bit
     * instead of 32 bit), so that we get away with fewer instructions
     * on most RISC machines (e.g. MIPS, Sparc) */
    val = val + (val >> 16);
    /* add adjacent 4-bit fields to produce 2 5-bit fields in the bottom
     * two bytes */
    val = ((val & 0x0f0f) + ((val >> 4) & 0x0f0f));
    /* add the final two byte fields (each with max value 16) together
     * to produce the final value (range 0-32) in the bottom byte  */
    val = val + (val >> 8);
  
    /* mask out the garbage in the second byte */
    return (val & 0x3f);
  }

  /* Return the number of zero bits above the highest set bit of `val`
     (32 when `val` is 0). */
  inline
  uint32_t
  count_leading_zeros(uint32_t val)
  {
    /* smear the highest set bit downwards so that it and every bit
       below it are set; the bits still clear are the leading zeros */
    val |= (val >> 1);
    val |= (val >> 2);
    val |= (val >> 4);
    val |= (val >> 8);
    val |= (val >> 16);
    return 32 - popcount(val);
  }

  /* Return the number of zero bits below the lowest set bit of `val`
     (32 when `val` is 0). */
  inline
  uint32_t
  count_trailing_zeros(uint32_t val)
  {
    /* smear the lowest set bit upwards so that it and every bit above
       it are set; the bits still clear are the trailing zeros */
    val |= (val << 1);
    val |= (val << 2);
    val |= (val << 4);
    val |= (val << 8);
    val |= (val << 16);
    return 32 - popcount(val);
  }
}

#endif /* _BITOPS_H */
