
Choosing your place in memory

Memory management is serious business. In modern computing the real cost of using memory is partly hidden from the user, because the clock cycles it wastes are absorbed by all the other work the machine is doing.

Performance issues related to memory usually come down to two common problems. The first is the cost of allocation: the default allocators, like the one supplied with the STL, typically rely on the standard malloc implementation, which doesn't always perform optimally because of its thread-safety overhead. The second is fragmentation: frequent allocation and deallocation leaves holes of 'unusable' memory.
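As a rough illustration of the second problem (a minimal sketch, not part of the allocator below), interleaving allocations and frees of mixed sizes tends to leave holes that a later, larger request cannot reuse:

#include <stdlib.h>

int main()
{
 //Three neighbouring allocations of different sizes.
 void* a = malloc(24);
 void* b = malloc(40);
 void* c = malloc(24);

 //Freeing the middle one typically leaves a hole between a and c.
 free(b);

 //A 64-byte request cannot reuse that hole, so the heap has to look
 //elsewhere (or grow), even though free memory already exists.
 void* d = malloc(64);

 free(a); free(c); free(d);
 return 0;
}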

Most of the time these holes can be reused if their size satisfies your needs. However, heaps were invented for general-purpose allocations of arbitrary length: the heap doesn't care what type of data will be stored, it just cares about finding it a spot. In practice there is nothing wrong with that approach; it will not break code, at worst it will only degrade performance.
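When every allocation has the same size, any freed slot can immediately serve the next request, so those holes never appear. A minimal fixed-size free-list sketch illustrates the idea (the allocator below achieves the same effect differently, through per-block index tables); the FixedPool name and layout here are purely illustrative:

//Hypothetical fixed-size pool: every slot holds exactly one T, and freed
//slots are chained into a singly linked free list for immediate reuse.
//(Assumes T needs no stricter alignment than a pointer.)
template<class T, int N>
struct FixedPool
{
 union Slot { Slot* next; unsigned char storage[sizeof(T)]; };

 Slot  slots[N];
 Slot* head;

 FixedPool() : head(&slots[0])
 {
  for( int i = 0; i + 1 < N; i++ )
   slots[i].next = &slots[i + 1];
  slots[N - 1].next = 0;
 }

 void* allocate()
 {
  if( head == 0 ) return 0;        //pool exhausted
  Slot* s = head; head = head->next;
  return s;
 }

 void free(void* p)
 {
  Slot* s = static_cast<Slot*>(p); //the slot becomes the new free-list head
  s->next = head; head = s;
 }
};

The allocator in this post keeps the same invariant, every slot in a block has size sizeof(T), but tracks free slots with a forward/reverse index pair per block so that a block can also be released once it drains completely.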

That deterioration can be prevented to some extent by being critical about your data and its access patterns. My own needs along those lines resulted in the allocation algorithm below.


#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <new>

typedef unsigned int uint32;
typedef signed int int32;
typedef unsigned short uint16;
typedef unsigned char uint8;

//Pair of function pointers so the pool can be wired to any backing allocator.
struct HeapAllocatorThunk
{
 void* (*malloc)(int size);
 void  (*free)(void* ptr);
};

//Default thunk that simply forwards to the system malloc/free.
struct DefaultMallocAllocator
{
 static void* malloc(int size) { return ::malloc(size); }
 static void  free(void* ptr) { ::free(ptr); }
 static HeapAllocatorThunk thunk(){
  HeapAllocatorThunk t;
  t.malloc = &DefaultMallocAllocator::malloc;
  t.free = &DefaultMallocAllocator::free;
  return t;
 }
};

template<class T> class HeapPool
{
public:
 static const int32 INITIAL_SIZE = 1;
 static const int32 BLOCK_SIZE = 4096;
 unsigned char**    blocks;          //array of BLOCK_SIZE data blocks
 uint16*            blockPartition;  //per block: number of slots currently in use
 uint32             metaBlockCapacity, metaDataSize; //slots per block, bytes per slot (sizeof(T))
 int32 m_mask, m_arraySize, m_partition, m_used, m_index_lru; //block array mask/size, active block count, live allocations, last block touched
 HeapAllocatorThunk m_allocator;     //backing allocator for the data blocks

 HeapPool()
 {
  initialize();
  m_allocator = DefaultMallocAllocator::thunk();
 }

 HeapPool(HeapAllocatorThunk& thunk)
 {
  initialize();
  m_allocator = thunk;
 }

 void initialize()
 {
  m_arraySize = INITIAL_SIZE;
  m_mask = INITIAL_SIZE - 1;
  m_partition = 0;
  m_used = 0;
  m_index_lru = 0;

  //The bookkeeping arrays (block pointers and per-block usage counters) live on
  //the regular heap; only the data blocks themselves go through the thunk.
  blocks = reinterpret_cast<unsigned char**>(::malloc(INITIAL_SIZE * sizeof(unsigned char*)));
  blockPartition = reinterpret_cast<uint16*>(::malloc(INITIAL_SIZE * sizeof(uint16)));
  memset( blocks, 0, sizeof(void*) * INITIAL_SIZE );
  memset( blockPartition, 0, sizeof(uint16) * INITIAL_SIZE );
  metaDataSize = sizeof(T);
  //Each slot needs one forward index, one reverse index and the payload itself.
  metaBlockCapacity = BLOCK_SIZE / ( sizeof(uint16) + sizeof(uint16) + metaDataSize );
 }

 //Initialize the forward and reverse index tables of a fresh block to the
 //identity mapping: slot i refers to data element i.
 void prepare( void* block )
 {
  uint16* blockForwardMapping = reinterpret_cast<uint16*>( reinterpret_cast<uint8*>(block) );
  uint16* blockReverseMapping = reinterpret_cast<uint16*>( reinterpret_cast<uint8*>(block) + (metaBlockCapacity * sizeof(uint16)) );
  for( int i = 0, c = metaBlockCapacity; i < c; i++ ) {
   blockForwardMapping[i] = blockReverseMapping[i] = i;
  }
 }

 //Hand out one slot of sizeof(T) bytes: scan the existing blocks round-robin
 //for free capacity, otherwise allocate a new BLOCK_SIZE block via the thunk.
 void* allocate()
 {
  int counter = 0;
  while( m_partition > 0 )
  {
   if( blocks[m_index_lru] != 0x0 && blockPartition[m_index_lru] < metaBlockCapacity )
   {
     //The data area starts after the two index tables; the forward table gives
     //the next free slot below the partition index.
     unsigned char* blockData = blocks[m_index_lru] + (metaBlockCapacity * 2 * sizeof(uint16));
     uint16* blockForwardMapping = reinterpret_cast<uint16*>( blocks[m_index_lru] );
     uint16 typeId = blockForwardMapping[blockPartition[m_index_lru]++];
     uint16 offset = typeId * metaDataSize;
    m_used++;
    return &blockData[offset];
   }

    //Every active block was scanned without finding room: allocate a new block.
    if( counter++ == m_arraySize ) {
    goto allocate_block;
   } else {
    m_index_lru = (m_index_lru + 1) % m_partition;
   }
  }  

 allocate_block:
  //There is room left in the block index array: grab a fresh block through
  //the thunk and initialize its index tables.
  if( m_partition < m_arraySize )
  {
   m_index_lru = m_partition++;
   blocks[m_index_lru] = (unsigned char*)m_allocator.malloc( BLOCK_SIZE );
   prepare(blocks[m_index_lru]);

   //Same slot handout as in the scan above, now from the freshly prepared block.
   unsigned char* blockData = blocks[m_index_lru] + (metaBlockCapacity * 2 * sizeof(uint16));
   uint16* blockForwardMapping = reinterpret_cast<uint16*>( blocks[m_index_lru] );
   uint16 typeId = blockForwardMapping[blockPartition[m_index_lru]++];
   uint16  offset = typeId * metaDataSize;
   m_used++;
   return &blockData[offset];
  }
  else
  {
   //The block index array is full: double the bookkeeping arrays. These are
   //plain heap allocations; only the data blocks themselves go through the thunk.
   unsigned char** newBlocks = reinterpret_cast<unsigned char**>(::malloc((m_arraySize << 1) * sizeof(unsigned char*)));
   uint16* newPartition = reinterpret_cast<uint16*>(::malloc((m_arraySize << 1) * sizeof(uint16)));
   memset( newBlocks, 0, sizeof(void*) * (m_arraySize << 1) );
   memset( newPartition, 0, sizeof(uint16) * (m_arraySize << 1) );
   memcpy( newBlocks, blocks, sizeof(unsigned char*) * m_arraySize );
   memcpy( newPartition, blockPartition, sizeof(uint16) * m_arraySize );

   // Reset the field values, incl. the mask.
   ::free(blocks); ::free(blockPartition); blocks = newBlocks; blockPartition = newPartition;
   m_arraySize = (m_arraySize << 1);
   m_mask = (m_mask << 1) | 1;
   goto allocate_block;
  }    

  return 0x0;
 }

 //Return a slot to the pool. The owning block is found by scanning the block
 //list; a block whose last slot is freed is released back to the thunk.
 void free(void* reference)
 {
  if( reference == 0x0 ) {
   return;
  }

  //Scan at most m_partition blocks; bail out if the pointer is not owned by the pool.
  int scanned = 0;
  while( m_partition > 0 && scanned++ < m_partition )
  {
   void* low = blocks[m_index_lru], *high = blocks[m_index_lru] + BLOCK_SIZE;
   if( reference >= low && reference < high ) {

    uint16* blockForwardMapping = reinterpret_cast<uint16*>( blocks[m_index_lru] + (        0 * sizeof(uint16)));
    uint16* blockReverseMapping = reinterpret_cast<uint16*>( blocks[m_index_lru] + (metaBlockCapacity * sizeof(uint16)));

    //Find the datastore indices; these correlate directly to slots in the data area.
    uint16 offset = reinterpret_cast<unsigned char*>(reference) - blocks[m_index_lru];
    int32 dataStorePartitionIndex  = blockPartition[m_index_lru] - 1;
    int32 dataStoreCurrentIndex    = (offset - (2 * sizeof(uint16) * metaBlockCapacity)) / metaDataSize;
    int32 indexStoreCurrentIndex   = blockReverseMapping[ dataStoreCurrentIndex ];
    int32 indexStorePartitionIndex = dataStorePartitionIndex;

    //Swap the two indices: effectively the last occupied element moves below the
    //partition index, registering it as free, and the element from that location
    //takes over the 'freed' slot.
    uint16 A = blockForwardMapping[ indexStoreCurrentIndex   ];
    uint16 B = blockForwardMapping[ indexStorePartitionIndex ];
    blockForwardMapping[ indexStoreCurrentIndex   ] = B;
    blockForwardMapping[ indexStorePartitionIndex ] = A;

    //Swap the corresponding entries in the reverse lookup table, which maps a
    //data slot back to its position in the forward table.
    uint16 C = blockReverseMapping[ A ];
    uint16 D = blockReverseMapping[ B ];
    blockReverseMapping[ A ] = D;
    blockReverseMapping[ B ] = C;
    blockPartition[m_index_lru]--;
    m_used--;

    //The block is now completely empty: swap it with the last active block and
    //hand it back to the backing allocator.
    if( blockPartition[m_index_lru] == 0 ) {
     void* data = blocks[m_index_lru];
     blocks[m_index_lru]   = blocks[m_partition - 1];
     blockPartition[m_index_lru] = blockPartition[m_partition - 1];
     blocks[m_partition - 1]    = 0;
     blockPartition[m_partition - 1]  = 0;
     m_index_lru = m_partition - 1;
     m_partition--;  m_allocator.free( data );
    }

    return;
   }

   m_index_lru = (m_index_lru + 1) % m_partition;
  }
 }
};

struct MeshNode
{
 int32 lod;
 int32 batchStart, batchCount;
 int32 vertRStart, vertREnd;
 int32 matrixId;
 void *material, *bounds, *geometry;
};

//Raw storage for the pool object itself; it is constructed with placement new in main().
void* scratchPad[sizeof(HeapPool<MeshNode>)];

//A static arena of 128 fixed 4096-byte blocks, handed out by the thunk below.
const int32 blockCount = 128;
uint8 block_data[4096 * blockCount];
void* block_dataIndexes[blockCount];
int32 block_partition;
HeapPool<MeshNode>& allocator = *reinterpret_cast<HeapPool<MeshNode>*>(&scratchPad[0]);

static void* block_malloc(int size)
{
 //Hand out the next unused block from the static arena; the pool only ever
 //asks for BLOCK_SIZE bytes, so every block is the right size.
 if( block_partition < blockCount ) {
  return block_dataIndexes[block_partition++];
 }

 printf("fallback allocate\n");
 return ::malloc(size);
}

static void block_free(void* ptr)
{
 //Return the block to the arena by swapping it behind the partition index.
 for( int i = 0; i < blockCount; i++ ) {
  if( ptr == block_dataIndexes[i] ) {
   block_dataIndexes[i] = block_dataIndexes[block_partition - 1];
   block_dataIndexes[block_partition - 1] = ptr;
   block_partition--;
   return;
  }
 }

 printf("fallback deallocate\n");
 ::free(ptr);
}

int main()
{
 //Carve the static arena into fixed power-of-two (4096-byte) blocks.
 for( int i = 0; i < blockCount; i++ )
  block_dataIndexes[i] = &block_data[i * 4096];
 block_partition = 0;

 //Create a thunk for memory allocation/deallocation
 HeapAllocatorThunk t;
 t.malloc = &block_malloc;
 t.free = &block_free; 

 //Placement new
 new (scratchPad) HeapPool<MeshNode>(t);

 //Memory allocation test..
 void* item[400];
 for( int j = 0; j < 1000; j++ )
 {
  for( int i = 0; i < 400; i++ ) {
   item[i] = allocator.allocate();
  }

  for( int i = 0; i < 400; i++ ) {
   allocator.free(item[i]);
  }
 } 

 printf("readyrn");
 getchar();
}
