Commit 26d7a13e authored by Daniel Krebs

gpu: always allocate page-sized chunks, then use LinearAllocator

This was necessary in order to make the memory available via GDRcopy
when multiple small allocations were made. cudaMalloc() would return
multiple memory chunks located in the same GPU page, which GDRcopy
refuses to map (`gdrdrv:offset != 0 is not supported`).

As a side effect, this keeps the number of BAR mappings created via
GDRcopy low, since their number appears to be quite limited.
parent 2ef40409
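
To make the strategy concrete: requested sizes are rounded up to the 64 KiB
GPU page size before calling cudaMalloc(), so that chunks occupy whole GPU
pages and GDRcopy can map them at offset 0. A minimal standalone sketch of
that rounding, with the constant and expression taken from the diff below
(the main() driver is illustrative only, not part of the commit):

    #include <cstddef>
    #include <cstdio>

    static constexpr size_t GpuPageSize = 64UL << 10; // 64 KiB, as in the diff

    // Round a requested block size up to the GPU page size. Note that a size
    // which is already page-aligned gains one extra page (65536 -> 131072);
    // this over-allocates slightly but remains correct.
    static size_t roundToChunkSize(size_t size) {
        return size - (size & (GpuPageSize - 1)) + GpuPageSize;
    }

    int main() {
        printf("%zu -> %zu\n", size_t(100),   roundToChunkSize(100));   // 100 -> 65536
        printf("%zu -> %zu\n", size_t(65536), roundToChunkSize(65536)); // 65536 -> 131072
        printf("%zu -> %zu\n", size_t(70000), roundToChunkSize(70000)); // 70000 -> 131072
    }
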
@@ -365,29 +365,53 @@ GpuAllocator::allocateBlock(size_t size)
     cudaSetDevice(gpu.gpuId);
 
     void* addr;
-    if(cudaSuccess != cudaMalloc(&addr, size)) {
-        logger->error("cudaMalloc(..., size={}) failed", size);
-        throw std::bad_alloc();
-    }
 
     auto& mm = MemoryManager::get();
 
-    // assemble name for this block
-    std::stringstream name;
-    name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
-
-    auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
-    auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
-
-    const auto localAddr = reinterpret_cast<uintptr_t>(addr);
-    std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
-            mem(new MemoryBlock(localAddr, size, blockAddrSpaceId), this->free);
-
-    insertMemoryBlock(*mem);
-
-    gpu.makeAccessibleToPCIeAndVA(*mem);
-
-    return mem;
+    // search for an existing chunk that has enough free memory
+    auto chunk = std::find_if(chunks.begin(), chunks.end(), [&](const auto& chunk) {
+        return chunk->getAvailableMemory() >= size;
+    });
+
+    if(chunk != chunks.end()) {
+        logger->debug("Found existing chunk that can host the requested block");
+        return (*chunk)->allocateBlock(size);
+    } else {
+        // allocate a new chunk
+
+        // rounded-up multiple of GPU page size
+        const size_t chunkSize = size - (size & (GpuPageSize - 1)) + GpuPageSize;
+        logger->debug("Allocate new chunk of {:#x} bytes", chunkSize);
+
+        if(cudaSuccess != cudaMalloc(&addr, chunkSize)) {
+            logger->error("cudaMalloc(..., size={}) failed", chunkSize);
+            throw std::bad_alloc();
+        }
+
+        // assemble name for this block
+        std::stringstream name;
+        name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
+
+        auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
+        auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
+
+        const auto localAddr = reinterpret_cast<uintptr_t>(addr);
+        std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
+                mem(new MemoryBlock(localAddr, chunkSize, blockAddrSpaceId), this->free);
+
+        insertMemoryBlock(*mem);
+
+        // already make accessible to CPU
+        gpu.makeAccessibleToPCIeAndVA(*mem);
+
+        // create a new allocator to manage the chunk and push to chunk list
+        chunks.push_front(std::make_unique<LinearAllocator>(std::move(mem)));
+
+        // call again, this time there's a large enough chunk
+        return allocateBlock(size);
+    }
 }
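
With this change, only the first of a series of small allocations triggers a
cudaMalloc() and hence a GDRcopy BAR mapping; later blocks are carved out of
the existing chunk by its LinearAllocator. A hypothetical call sequence (the
wrapper function and sizes are invented for illustration):

    // Hypothetical caller, illustration only; how a GpuAllocator instance
    // is obtained is outside the scope of this commit.
    void example(GpuAllocator& alloc) {
        auto a = alloc.allocateBlock(256);       // no chunk yet: cudaMalloc(64 KiB), carve 256 B
        auto b = alloc.allocateBlock(512);       // fits the same chunk: no new cudaMalloc()
        auto c = alloc.allocateBlock(128 << 10); // too large for the remainder: new 192 KiB chunk
    }
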
@@ -62,6 +62,8 @@ private:
 class GpuAllocator : public BaseAllocator<GpuAllocator> {
 public:
+    static constexpr size_t GpuPageSize = 64UL << 10;
+
     GpuAllocator(Gpu& gpu);
 
     std::string getName() const;

@@ -71,6 +73,8 @@ public:
 private:
     Gpu& gpu;
 
+    // TODO: replace by multimap (key is available memory)
+    std::list<std::unique_ptr<LinearAllocator>> chunks;
 };
 
 class GpuFactory : public Plugin {
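
The TODO above points at replacing the linear std::find_if scan with a
container keyed by each chunk's free space. A sketch of that direction, not
part of this commit, assuming it is acceptable to re-key a chunk after every
allocation (this fragment would replace the search inside allocateBlock()):

    // Sketch: best-fit chunk lookup via std::multimap, as the TODO suggests.
    // The key (available bytes) changes with every allocation, so the entry
    // has to be erased and re-inserted; the commit keeps the plain list for now.
    std::multimap<size_t, std::unique_ptr<LinearAllocator>> chunksByFree;

    auto it = chunksByFree.lower_bound(size);     // first chunk with >= size bytes free
    if(it != chunksByFree.end()) {
        auto chunk = std::move(it->second);
        chunksByFree.erase(it);
        auto block = chunk->allocateBlock(size);
        chunksByFree.emplace(chunk->getAvailableMemory(), std::move(chunk)); // re-key
        return block;
    }
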