Memory management is serious business. In modern computing the cost of using memory is partly hidden from the user, because clock cycles that would otherwise be wasted waiting on memory are spent on other operations.
Performance issues related to memory usually stem from two common problems. The first is the cost of allocation: the default allocators supplied with libraries such as the STL typically rely on the default implementation of malloc, which does not always perform optimally because it has to be thread-safe. The second is fragmentation: frequent allocation and deallocation leaves holes of 'unusable' memory.
Most of the time these holes can be reused if their size satisfies the request. However, the heap was designed for general-purpose allocations of arbitrary length: it does not care what type of data will be stored, it only cares about finding it a spot. In practice there is nothing wrong with that approach; it will not break code, at worst it will only degrade performance.
That deterioration can be prevented to some extent by being critical about your data and its access patterns. My own needs led to the development of the allocation algorithm below.
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <new>

typedef unsigned int uint32;
typedef signed int int32;
typedef unsigned short uint16;
typedef unsigned char uint8;

struct HeapAllocatorThunk
{
    void* (*malloc)(int size);
    void (*free)(void* ptr);
};

struct DefaultMallocAlloctor
{
    static void* malloc(int size) { return ::malloc(size); }
    static void free(void* ptr) { return ::free(ptr); }

    static HeapAllocatorThunk thunk()
    {
        HeapAllocatorThunk t;
        t.malloc = &DefaultMallocAlloctor::malloc;
        t.free = &DefaultMallocAlloctor::free;
        return t;
    }
};

// Each 4096-byte block holds two uint16 index tables (a forward and a reverse
// mapping of slot ids) followed by the slot data itself: [forward][reverse][data].
template<class T>
class HeapPool
{
public:
    static const int32 INITIAL_SIZE = 1;
    static const int32 BLOCK_SIZE = 4096;

    unsigned char** blocks;
    uint16* blockPartition;
    uint32 metaBlockCapacity, metaDataSize;
    int32 m_mask, m_arraySize, m_partition, m_used, m_index_lru;
    HeapAllocatorThunk m_allocator;

    HeapPool()
    {
        initialize();
        m_allocator = DefaultMallocAlloctor::thunk();
    }

    HeapPool(HeapAllocatorThunk& thunk)
    {
        initialize();
        m_allocator = thunk;
    }

    void initialize()
    {
        m_arraySize = INITIAL_SIZE;
        m_mask = INITIAL_SIZE - 1;
        m_partition = 0;
        m_used = 0;
        m_index_lru = 0;
        blocks = reinterpret_cast<unsigned char**>(malloc(INITIAL_SIZE * sizeof(unsigned char**)));
        blockPartition = reinterpret_cast<uint16*>(malloc(INITIAL_SIZE * sizeof(uint16)));
        memset( blocks, 0, sizeof(void*) * INITIAL_SIZE );
        memset( blockPartition, 0, sizeof(uint16) * INITIAL_SIZE );
        metaDataSize = sizeof(T);
        // Number of slots that fit in a block next to the two index tables.
        metaBlockCapacity = BLOCK_SIZE / ( sizeof(uint16) + sizeof(uint16) + metaDataSize );
    }

    void prepare( void* block )
    {
        for( int i = 0, c = metaBlockCapacity; i < c; i++ )
        {
            uint16* blockForwardMapping = reinterpret_cast<uint16*>( reinterpret_cast<uint8*>(block) + (0 * sizeof(uint16)));
            uint16* blockReverseMapping = reinterpret_cast<uint16*>( reinterpret_cast<uint8*>(block) + (c * sizeof(uint16)));
            blockForwardMapping[i] = blockReverseMapping[i] = i;
        }
    }

    void* allocate()
    {
        int counter = 0;
        while( m_partition > 0 )
        {
            if( blocks[m_index_lru] != 0x0 && blockPartition[m_index_lru] < metaBlockCapacity )
            {
                unsigned char* blockData = blocks[m_index_lru] + (metaBlockCapacity * 2 * sizeof(uint16));
                uint16* blockForwardMapping = reinterpret_cast<uint16*>( blocks[m_index_lru] );
                uint16 typeId = blockForwardMapping[blockPartition[m_index_lru]++];
                uint16 offset = typeId * metaDataSize;
                m_used++;
                return &blockData[offset];
            }
            if( counter++ == m_arraySize )
            {
                goto allocate_block;
            }
            else
            {
                m_index_lru = (m_index_lru + 1) % m_partition;
            }
        }

    allocate_block:
        if( m_partition < m_arraySize )
        {
            m_index_lru = m_partition++;
            blocks[m_index_lru] = (unsigned char*)m_allocator.malloc( BLOCK_SIZE );
            prepare(blocks[m_index_lru]);
            unsigned char* blockData = blocks[m_index_lru] + (metaBlockCapacity * 2 * sizeof(uint16));
            uint16* blockForwardMapping = reinterpret_cast<uint16*>( blocks[m_index_lru] );
            uint16 typeId = blockForwardMapping[blockPartition[m_index_lru]++];
            uint16 offset = typeId * metaDataSize;
            m_used++;
            return &blockData[offset];
        }
        else
        {
            //Allocate new storage and clean...
            unsigned char** newBlocks = reinterpret_cast<unsigned char**>(malloc((m_arraySize << 1) * sizeof(unsigned char**)));
            uint16* newPartition = reinterpret_cast<uint16*>(malloc((m_arraySize << 1) * sizeof(uint16)));
            memset( newBlocks, 0, sizeof(void*) * (m_arraySize << 1) );
            memset( newPartition, 0, sizeof(uint16) * (m_arraySize << 1) );
            memcpy( newBlocks, blocks, sizeof(unsigned char*) * m_arraySize );
            memcpy( newPartition, blockPartition, sizeof(uint16) * m_arraySize );

            //Reset the field values, incl. the mask.
            ::free(blocks);
            ::free(blockPartition);
            blocks = newBlocks;
            blockPartition = newPartition;
            m_arraySize = (m_arraySize << 1);
            m_mask = (m_mask << 1) | 1;
            goto allocate_block;
        }

        return 0x0;
    }

    void free(void* reference)
    {
        if( reference == 0x0 )
        {
            return;
        }

        while( m_partition > 0 )
        {
            void* low = blocks[m_index_lru], *high = blocks[m_index_lru] + BLOCK_SIZE;
            if( reference >= low && reference < high )
            {
                uint16* blockForwardMapping = reinterpret_cast<uint16*>( blocks[m_index_lru] + ( 0 * sizeof(uint16)));
                uint16* blockReverseMapping = reinterpret_cast<uint16*>( blocks[m_index_lru] + (metaBlockCapacity * sizeof(uint16)));

                //Find the datastore indices, these correlate directly to the indices of the datastore.
                uint16 offset = reinterpret_cast<unsigned char*>(reference) - blocks[m_index_lru];
                int32 dataStoreParitionIndex = blockPartition[m_index_lru] - 1;
                int32 dataStoreCurrentIndex = (offset - (sizeof(uint32) * metaBlockCapacity)) / metaDataSize;
                int32 indexStoreCurrentIndex = blockReverseMapping[ dataStoreCurrentIndex ];
                int32 indexStoreParitionIndex = dataStoreParitionIndex;

                //Swap the two indices: effectively we move the last occupied element below the partition
                //index, registering it as free, and we move the element from that location onto the
                //'freed' location.
                uint16 A = blockForwardMapping[ indexStoreCurrentIndex ];
                uint16 B = blockForwardMapping[ indexStoreParitionIndex ];
                blockForwardMapping[ indexStoreCurrentIndex ] = B;
                blockForwardMapping[ indexStoreParitionIndex ] = A;

                //Swap our two values from the reverse lookup table. The reverse lookup table is an implicit table.
                uint16 C = blockReverseMapping[ A ];
                uint16 D = blockReverseMapping[ B ];
                blockReverseMapping[ A ] = D;
                blockReverseMapping[ B ] = C;

                blockPartition[m_index_lru]--;
                m_used--;

                if( blockPartition[m_index_lru] == 0 )
                {
                    void* data = blocks[m_index_lru];
                    blocks[m_index_lru] = blocks[m_partition - 1];
                    blockPartition[m_index_lru] = blockPartition[m_partition - 1];
                    blocks[m_partition - 1] = 0;
                    blockPartition[m_partition - 1] = 0;
                    m_index_lru = m_partition - 1;
                    m_partition--;
                    m_allocator.free( data );
                }
                return;
            }
            m_index_lru = (m_index_lru + 1) % m_partition;
        }
    }
};

struct MeshNode
{
    int32 lod;
    int32 batchStart, batchCount;
    int32 vertRStart, vertREnd;
    int32 matrixId;
    void *material, *bounds, *geometry;
};

void* scratchPad[sizeof(HeapPool<MeshNode>)];

const int32 blockCount = 128;
uint8 block_data[4096 * blockCount];
void* block_dataIndexes[blockCount];
int32 block_partition;

HeapPool<MeshNode>& allocator = *reinterpret_cast<HeapPool<MeshNode>*>(&scratchPad[0]);

static void* block_malloc(int size)
{
    if( block_partition < 128 )
    {
        return block_dataIndexes[block_partition++];
    }
    printf("fallback allocate\r\n");
    return ::malloc(size);
}

static void block_free(void* ptr)
{
    for( int i = 0; i < 128; i++ )
    {
        if( ptr == block_dataIndexes[i] )
        {
            block_dataIndexes[i] = block_dataIndexes[block_partition - 1];
            block_dataIndexes[block_partition - 1] = ptr;
            block_partition--;
            return;
        }
    }
    printf("fallback deallocate\r\n");
    return ::free(ptr);
}

int main()
{
    //Fragment memory into fixed blocks of power-of-two size..
    for( int i = 0; i < blockCount; i++ )
        block_dataIndexes[i] = &block_data[i * 4096];
    block_partition = 0;

    //Create a thunk for memory allocation/deallocation
    HeapAllocatorThunk t;
    t.malloc = &block_malloc;
    t.free = &block_free;

    //Placement new
    new (scratchPad) HeapPool<MeshNode>(t);

    //Memory allocation test..
    void* item[400];
    for( int j = 0; j < 1000; j++ )
    {
        for( int i = 0; i < 400; i++ )
        {
            item[i] = allocator.allocate();
        }
        for( int i = 0; i < 400; i++ )
        {
            allocator.free(item[i]);
        }
    }

    printf("ready\r\n");
    getchar();
    return 0;
}
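For completeness, below is a minimal usage sketch that skips the scratch pad and the custom block thunk from the test above and simply relies on the default malloc-backed thunk. The Particle type and the example() function are made up purely for illustration; the sketch assumes the HeapPool code above is available in the same translation unit.

// Minimal usage sketch, assuming the HeapPool definition above is visible here.
// 'Particle' and 'example' are hypothetical names used only for this example.
#include <new>

struct Particle { float x, y, z; float vx, vy, vz; };

int example()
{
    HeapPool<Particle> pool;              // default ctor falls back to the malloc/free thunk

    void* slot = pool.allocate();         // raw, uninitialized slot of sizeof(Particle) bytes
    Particle* p = new (slot) Particle();  // construct in place
    p->x = 1.0f;

    p->~Particle();                       // destroy manually...
    pool.free(p);                         // ...then hand the slot back to the pool
    return 0;
}

Note that the pool only hands out raw slots; construction and destruction stay with the caller, which keeps allocate() and free() free of any per-type logic, and a freed slot is made reusable immediately by swapping index entries rather than by walking a free list.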