Commit 26d7a13e authored by Daniel Krebs's avatar Daniel Krebs
Browse files

gpu: always allocate page-sized chunks, then use LinearAllocator

This was neccessary in order to make the memory available via GDRcopy
when multiple small allocations were made. cudaMalloc() would return
multiple memory chunks located in the same GPU page, which GDRcopy
pretty much dislikes (`gdrdrv:offset != 0 is not supported`).
As a side effect, this will keep the number of BAR-mappings done
via GDRcopy low, because they seem to be quite limited.
parent 2ef40409
......@@ -365,29 +365,53 @@ GpuAllocator::allocateBlock(size_t size)
cudaSetDevice(gpu.gpuId);
void* addr;
if(cudaSuccess != cudaMalloc(&addr, size)) {
logger->error("cudaMalloc(..., size={}) failed", size);
throw std::bad_alloc();
}
auto& mm = MemoryManager::get();
// assemble name for this block
std::stringstream name;
name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
// search for an existing chunk that has enough free memory
auto chunk = std::find_if(chunks.begin(), chunks.end(), [&](const auto& chunk) {
return chunk->getAvailableMemory() >= size;
});
if(chunk != chunks.end()) {
logger->debug("Found existing chunk that can host the requested block");
return (*chunk)->allocateBlock(size);
} else {
// allocate a new chunk
// rounded-up multiple of GPU page size
const size_t chunkSize = size - (size & (GpuPageSize - 1)) + GpuPageSize;
logger->debug("Allocate new chunk of {:#x} bytes", chunkSize);
if(cudaSuccess != cudaMalloc(&addr, chunkSize)) {
logger->error("cudaMalloc(..., size={}) failed", chunkSize);
throw std::bad_alloc();
}
auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
// assemble name for this block
std::stringstream name;
name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
const auto localAddr = reinterpret_cast<uintptr_t>(addr);
std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
mem(new MemoryBlock(localAddr, size, blockAddrSpaceId), this->free);
auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
insertMemoryBlock(*mem);
const auto localAddr = reinterpret_cast<uintptr_t>(addr);
std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
mem(new MemoryBlock(localAddr, chunkSize, blockAddrSpaceId), this->free);
gpu.makeAccessibleToPCIeAndVA(*mem);
insertMemoryBlock(*mem);
return mem;
// already make accessible to CPU
gpu.makeAccessibleToPCIeAndVA(*mem);
// create a new allocator to manage the chunk and push to chunk list
chunks.push_front(std::make_unique<LinearAllocator>(std::move(mem)));
// call again, this time there's a large enough chunk
return allocateBlock(size);
}
}
......
......@@ -62,6 +62,8 @@ private:
class GpuAllocator : public BaseAllocator<GpuAllocator> {
public:
static constexpr size_t GpuPageSize = 64UL << 10;
GpuAllocator(Gpu& gpu);
std::string getName() const;
......@@ -71,6 +73,8 @@ public:
private:
Gpu& gpu;
// TODO: replace by multimap (key is available memory)
std::list<std::unique_ptr<LinearAllocator>> chunks;
};
class GpuFactory : public Plugin {
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment