
Choosing your place in memory

Memory management is serious business. In modern computing the real cost of using memory is partly hidden from the user, because the clock cycles it wastes are absorbed by all the other work the machine is doing.

Performance issues related to memory usually come down to two common problems. The first is the cost of allocation: the default allocators, like the one supplied with the STL, typically rely on the standard malloc implementation, which doesn't always perform optimally because of its thread-safety overhead. The second is fragmentation: frequent allocation and deallocation leaves holes of 'unusable' memory.
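As a rough illustration of the second problem (a minimal sketch, not part of the allocator below), interleaving allocations and frees of mixed sizes tends to leave holes that a later, larger request cannot reuse:

#include <stdlib.h>

int main()
{
 //Three neighbouring allocations of different sizes.
 void* a = malloc(24);
 void* b = malloc(40);
 void* c = malloc(24);

 //Freeing the middle one typically leaves a hole between a and c.
 free(b);

 //A 64-byte request cannot reuse that hole, so the heap has to look
 //elsewhere (or grow), even though free memory already exists.
 void* d = malloc(64);

 free(a); free(c); free(d);
 return 0;
}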

Most of the time these holes can be reused if their size satisfies your needs. However, heaps were invented for general-purpose allocations of arbitrary length: the heap doesn't care what type of data will be stored, it just cares about finding it a spot. In practice there is nothing wrong with that approach; it will not break code, at worst it will only degrade performance.
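When every allocation has the same size, any freed slot can immediately serve the next request, so those holes never appear. A minimal fixed-size free-list sketch illustrates the idea (the allocator below achieves the same effect differently, through per-block index tables); the FixedPool name and layout here are purely illustrative:

//Hypothetical fixed-size pool: every slot holds exactly one T, and freed
//slots are chained into a singly linked free list for immediate reuse.
//(Assumes T needs no stricter alignment than a pointer.)
template<class T, int N>
struct FixedPool
{
 union Slot { Slot* next; unsigned char storage[sizeof(T)]; };

 Slot  slots[N];
 Slot* head;

 FixedPool() : head(&slots[0])
 {
  for( int i = 0; i + 1 < N; i++ )
   slots[i].next = &slots[i + 1];
  slots[N - 1].next = 0;
 }

 void* allocate()
 {
  if( head == 0 ) return 0;        //pool exhausted
  Slot* s = head; head = head->next;
  return s;
 }

 void free(void* p)
 {
  Slot* s = static_cast<Slot*>(p); //the slot becomes the new free-list head
  s->next = head; head = s;
 }
};

The allocator in this post keeps the same invariant, every slot in a block has size sizeof(T), but tracks free slots with a forward/reverse index pair per block so that a block can also be released once it drains completely.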

That deterioration can be prevented to some extent by being critical about your data and its access patterns. My own needs along those lines resulted in the allocation algorithm below.


#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <new>

typedef unsigned int uint32;
typedef signed int int32;
typedef unsigned short uint16;
typedef unsigned char uint8;

//Pair of function pointers so the pool can be wired to any backing allocator.
struct HeapAllocatorThunk
{
 void* (*malloc)(int size);
 void  (*free)(void* ptr);
};

//Default thunk that simply forwards to the system malloc/free.
struct DefaultMallocAllocator
{
 static void* malloc(int size) { return ::malloc(size); }
 static void  free(void* ptr) { ::free(ptr); }
 static HeapAllocatorThunk thunk(){
  HeapAllocatorThunk t;
  t.malloc = &DefaultMallocAllocator::malloc;
  t.free = &DefaultMallocAllocator::free;
  return t;
 }
};

template<class T> class HeapPool
{
public:
 static const int32 INITIAL_SIZE = 1;
 static const int32 BLOCK_SIZE = 4096;
 unsigned char**    blocks;          //array of BLOCK_SIZE data blocks
 uint16*            blockPartition;  //per block: number of slots currently in use
 uint32             metaBlockCapacity, metaDataSize; //slots per block, bytes per slot (sizeof(T))
 int32 m_mask, m_arraySize, m_partition, m_used, m_index_lru; //block array mask/size, active block count, live allocations, last block touched
 HeapAllocatorThunk m_allocator;     //backing allocator for the data blocks

 HeapPool()
 {
  initialize();
  m_allocator = DefaultMallocAllocator::thunk();
 }

 HeapPool(HeapAllocatorThunk& thunk)
 {
  initialize();
  m_allocator = thunk;
 }

 void initialize()
 {
  m_arraySize = INITIAL_SIZE;
  m_mask = INITIAL_SIZE - 1;
  m_partition = 0;
  m_used = 0;
  m_index_lru = 0;

  //The bookkeeping arrays (block pointers and per-block usage counters) live on
  //the regular heap; only the data blocks themselves go through the thunk.
  blocks = reinterpret_cast<unsigned char**>(::malloc(INITIAL_SIZE * sizeof(unsigned char*)));
  blockPartition = reinterpret_cast<uint16*>(::malloc(INITIAL_SIZE * sizeof(uint16)));
  memset( blocks, 0, sizeof(void*) * INITIAL_SIZE );
  memset( blockPartition, 0, sizeof(uint16) * INITIAL_SIZE );
  metaDataSize = sizeof(T);
  //Each slot needs one forward index, one reverse index and the payload itself.
  metaBlockCapacity = BLOCK_SIZE / ( sizeof(uint16) + sizeof(uint16) + metaDataSize );
 }

 //Initialize the forward and reverse index tables of a fresh block to the
 //identity mapping: slot i refers to data element i.
 void prepare( void* block )
 {
  uint16* blockForwardMapping = reinterpret_cast<uint16*>( reinterpret_cast<uint8*>(block) );
  uint16* blockReverseMapping = reinterpret_cast<uint16*>( reinterpret_cast<uint8*>(block) + (metaBlockCapacity * sizeof(uint16)) );
  for( int i = 0, c = metaBlockCapacity; i < c; i++ ) {
   blockForwardMapping[i] = blockReverseMapping[i] = i;
  }
 }

 //Hand out one slot of sizeof(T) bytes: scan the existing blocks round-robin
 //for free capacity, otherwise allocate a new BLOCK_SIZE block via the thunk.
 void* allocate()
 {
  int counter = 0;
  while( m_partition > 0 )
  {
   if( blocks[m_index_lru] != 0x0 && blockPartition[m_index_lru] < metaBlockCapacity )
   {
     //The data area starts after the two index tables; the forward table gives
     //the next free slot below the partition index.
     unsigned char* blockData = blocks[m_index_lru] + (metaBlockCapacity * 2 * sizeof(uint16));
     uint16* blockForwardMapping = reinterpret_cast<uint16*>( blocks[m_index_lru] );
     uint16 typeId = blockForwardMapping[blockPartition[m_index_lru]++];
     uint16 offset = typeId * metaDataSize;
    m_used++;
    return &blockData[offset];
   }

    //Every active block was scanned without finding room: allocate a new block.
    if( counter++ == m_arraySize ) {
    goto allocate_block;
   } else {
    m_index_lru = (m_index_lru + 1) % m_partition;
   }
  }  

 allocate_block:
  //There is room left in the block index array: grab a fresh block through
  //the thunk and initialize its index tables.
  if( m_partition < m_arraySize )
  {
   m_index_lru = m_partition++;
   blocks[m_index_lru] = (unsigned char*)m_allocator.malloc( BLOCK_SIZE );
   prepare(blocks[m_index_lru]);

   //Same slot handout as in the scan above, now from the freshly prepared block.
   unsigned char* blockData = blocks[m_index_lru] + (metaBlockCapacity * 2 * sizeof(uint16));
   uint16* blockForwardMapping = reinterpret_cast<uint16*>( blocks[m_index_lru] );
   uint16 typeId = blockForwardMapping[blockPartition[m_index_lru]++];
   uint16  offset = typeId * metaDataSize;
   m_used++;
   return &blockData[offset];
  }
  else
  {
   //The block index array is full: double the bookkeeping arrays. These are
   //plain heap allocations; only the data blocks themselves go through the thunk.
   unsigned char** newBlocks = reinterpret_cast<unsigned char**>(::malloc((m_arraySize << 1) * sizeof(unsigned char*)));
   uint16* newPartition = reinterpret_cast<uint16*>(::malloc((m_arraySize << 1) * sizeof(uint16)));
   memset( newBlocks, 0, sizeof(void*) * (m_arraySize << 1) );
   memset( newPartition, 0, sizeof(uint16) * (m_arraySize << 1) );
   memcpy( newBlocks, blocks, sizeof(unsigned char*) * m_arraySize );
   memcpy( newPartition, blockPartition, sizeof(uint16) * m_arraySize );

   // Reset the field values, incl. the mask.
   ::free(blocks); ::free(blockPartition); blocks = newBlocks; blockPartition = newPartition;
   m_arraySize = (m_arraySize << 1);
   m_mask = (m_mask << 1) | 1;
   goto allocate_block;
  }    

  return 0x0;
 }

 //Return a slot to the pool. The owning block is found by scanning the block
 //list; a block whose last slot is freed is released back to the thunk.
 void free(void* reference)
 {
  if( reference == 0x0 ) {
   return;
  }

  //Scan at most m_partition blocks; bail out if the pointer is not owned by the pool.
  int scanned = 0;
  while( m_partition > 0 && scanned++ < m_partition )
  {
   void* low = blocks[m_index_lru], *high = blocks[m_index_lru] + BLOCK_SIZE;
   if( reference >= low && reference < high ) {

    uint16* blockForwardMapping = reinterpret_cast<uint16*>( blocks[m_index_lru] + (        0 * sizeof(uint16)));
    uint16* blockReverseMapping = reinterpret_cast<uint16*>( blocks[m_index_lru] + (metaBlockCapacity * sizeof(uint16)));

    //Find the datastore indices; these correlate directly to slots in the data area.
    uint16 offset = reinterpret_cast<unsigned char*>(reference) - blocks[m_index_lru];
    int32 dataStorePartitionIndex  = blockPartition[m_index_lru] - 1;
    int32 dataStoreCurrentIndex    = (offset - (2 * sizeof(uint16) * metaBlockCapacity)) / metaDataSize;
    int32 indexStoreCurrentIndex   = blockReverseMapping[ dataStoreCurrentIndex ];
    int32 indexStorePartitionIndex = dataStorePartitionIndex;

    //Swap the two indices: effectively the last occupied element moves below the
    //partition index, registering it as free, and the element from that location
    //takes over the 'freed' slot.
    uint16 A = blockForwardMapping[ indexStoreCurrentIndex   ];
    uint16 B = blockForwardMapping[ indexStorePartitionIndex ];
    blockForwardMapping[ indexStoreCurrentIndex   ] = B;
    blockForwardMapping[ indexStorePartitionIndex ] = A;

    //Swap the corresponding entries in the reverse lookup table, which maps a
    //data slot back to its position in the forward table.
    uint16 C = blockReverseMapping[ A ];
    uint16 D = blockReverseMapping[ B ];
    blockReverseMapping[ A ] = D;
    blockReverseMapping[ B ] = C;
    blockPartition[m_index_lru]--;
    m_used--;

    //The block is now completely empty: swap it with the last active block and
    //hand it back to the backing allocator.
    if( blockPartition[m_index_lru] == 0 ) {
     void* data = blocks[m_index_lru];
     blocks[m_index_lru]   = blocks[m_partition - 1];
     blockPartition[m_index_lru] = blockPartition[m_partition - 1];
     blocks[m_partition - 1]    = 0;
     blockPartition[m_partition - 1]  = 0;
     m_index_lru = m_partition - 1;
     m_partition--;  m_allocator.free( data );
    }

    return;
   }

   m_index_lru = (m_index_lru + 1) % m_partition;
  }
 }
};

struct MeshNode
{
 int32 lod;
 int32 batchStart, batchCount;
 int32 vertRStart, vertREnd;
 int32 matrixId;
 void *material, *bounds, *geometry;
};

//Raw storage for the pool object itself; it is constructed with placement new in main().
void* scratchPad[sizeof(HeapPool<MeshNode>)];

//A static arena of 128 fixed 4096-byte blocks, handed out by the thunk below.
const int32 blockCount = 128;
uint8 block_data[4096 * blockCount];
void* block_dataIndexes[blockCount];
int32 block_partition;
HeapPool<MeshNode>& allocator = *reinterpret_cast<HeapPool<MeshNode>*>(&scratchPad[0]);

static void* block_malloc(int size)
{
 //Hand out the next unused block from the static arena; the pool only ever
 //asks for BLOCK_SIZE bytes, so every block is the right size.
 if( block_partition < blockCount ) {
  return block_dataIndexes[block_partition++];
 }

 printf("fallback allocate\n");
 return ::malloc(size);
}

static void block_free(void* ptr)
{
 //Return the block to the arena by swapping it behind the partition index.
 for( int i = 0; i < blockCount; i++ ) {
  if( ptr == block_dataIndexes[i] ) {
   block_dataIndexes[i] = block_dataIndexes[block_partition - 1];
   block_dataIndexes[block_partition - 1] = ptr;
   block_partition--;
   return;
  }
 }

 printf("fallback deallocate\n");
 ::free(ptr);
}

int main()
{
 //Carve the static arena into fixed power-of-two (4096-byte) blocks.
 for( int i = 0; i < blockCount; i++ )
  block_dataIndexes[i] = &block_data[i * 4096];
 block_partition = 0;

 //Create a thunk for memory allocation/deallocation
 HeapAllocatorThunk t;
 t.malloc = &block_malloc;
 t.free = &block_free; 

 //Placement new
 new (scratchPad) HeapPool<MeshNode>(t);

 //Memory allocation test..
 void* item[400];
 for( int j = 0; j < 1000; j++ )
 {
  for( int i = 0; i < 400; i++ ) {
   item[i] = allocator.allocate();
  }

  for( int i = 0; i < 400; i++ ) {
   allocator.free(item[i]);
  }
 } 

 printf("readyrn");
 getchar();
}
