Commit 26d7a13e authored by Daniel Krebs's avatar Daniel Krebs
Browse files

gpu: always allocate page-sized chunks, then use LinearAllocator

This was necessary in order to make the memory available via GDRcopy
when multiple small allocations were made. cudaMalloc() would return
multiple memory chunks located in the same GPU page, which GDRcopy
pretty much dislikes (`gdrdrv:offset != 0 is not supported`).
As a side effect, this keeps the number of BAR mappings done via
GDRcopy low, because the number of available mappings seems to be quite limited.
parent 2ef40409
......@@ -365,13 +365,31 @@ GpuAllocator::allocateBlock(size_t size)
void* addr;
if(cudaSuccess != cudaMalloc(&addr, size)) {
logger->error("cudaMalloc(..., size={}) failed", size);
auto& mm = MemoryManager::get();
// search for an existing chunk that has enough free memory
auto chunk = std::find_if(chunks.begin(), chunks.end(), [&](const auto& chunk) {
return chunk->getAvailableMemory() >= size;
if(chunk != chunks.end()) {
logger->debug("Found existing chunk that can host the requested block");
return (*chunk)->allocateBlock(size);
} else {
// allocate a new chunk
// rounded-up multiple of GPU page size
const size_t chunkSize = size - (size & (GpuPageSize - 1)) + GpuPageSize;
logger->debug("Allocate new chunk of {:#x} bytes", chunkSize);
if(cudaSuccess != cudaMalloc(&addr, chunkSize)) {
logger->error("cudaMalloc(..., size={}) failed", chunkSize);
throw std::bad_alloc();
auto& mm = MemoryManager::get();
// assemble name for this block
std::stringstream name;
name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
......@@ -381,13 +399,19 @@ GpuAllocator::allocateBlock(size_t size)
const auto localAddr = reinterpret_cast<uintptr_t>(addr);
std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
mem(new MemoryBlock(localAddr, size, blockAddrSpaceId), this->free);
mem(new MemoryBlock(localAddr, chunkSize, blockAddrSpaceId), this->free);
// already make accessible to CPU
return mem;
// create a new allocator to manage the chunk and push to chunk list
// call again, this time there's a large enough chunk
return allocateBlock(size);
......@@ -62,6 +62,8 @@ private:
class GpuAllocator : public BaseAllocator<GpuAllocator> {
static constexpr size_t GpuPageSize = 64UL << 10;
GpuAllocator(Gpu& gpu);
std::string getName() const;
......@@ -71,6 +73,8 @@ public:
Gpu& gpu;
// TODO: replace by multimap (key is available memory)
std::list<std::unique_ptr<LinearAllocator>> chunks;
class GpuFactory : public Plugin {
