#include <cstdio>
#include <cstdint>
#include <sys/mman.h>

#include <memory>
#include <algorithm>

#include <villas/gpu.hpp>
#include <villas/log.hpp>
#include <villas/kernel/pci.h>
#include <memory_manager.hpp>

#include <cuda.h>
#include <cuda_runtime.h>
#include <gdrapi.h>

#include "kernels.hpp"

namespace villas {
namespace gpu {

static GpuFactory gpuFactory;

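// The GpuAllocator manages memory that lives on the GPU, rooted in the GPU's
// master PCIe address space. Blocks are returned to CUDA through the
// deallocator registered below, which calls cudaFree() on the owning device.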
GpuAllocator::GpuAllocator(Gpu& gpu) :
    BaseAllocator(gpu.masterPciEAddrSpaceId),
    gpu(gpu)
{
	free = [&](MemoryBlock* mem) {
		cudaSetDevice(gpu.gpuId);
		if(cudaFree(reinterpret_cast<void*>(mem->getOffset())) != cudaSuccess) {
			logger->warn("cudaFree() failed for {:#x} of size {:#x}",
			             mem->getOffset(), mem->getSize());
		} else {
			logger->debug("cudaFree({:#x}) successfull (size was {:#x})",
			              mem->getOffset(), mem->getSize());
		}

		removeMemoryBlock(*mem);
	};
}

std::string
GpuAllocator::getName() const
{
	std::stringstream name;
	name << "GpuAlloc" << getAddrSpaceId();
	return name.str();
}


GpuFactory::GpuFactory() :
    Plugin(Plugin::Type::Gpu, "GPU")
{
	logger = loggerGetOrCreate("GpuFactory");
}

// The destructor must be defined here, where Gpu::impl is complete, so that the PIMPL unique_ptr compiles
Gpu::~Gpu()
{
	auto& mm = MemoryManager::get();
	mm.removeAddressSpace(masterPciEAddrSpaceId);
}


// we use PIMPL in order to hide gdrcopy types from the public header
class Gpu::impl {
public:
	gdr_t gdr;
	struct pci_device pdev;
};

std::string Gpu::getName() const
{
	cudaDeviceProp deviceProp;
	if(cudaGetDeviceProperties(&deviceProp, gpuId) != cudaSuccess) {
		// logger not yet available
		loggerGetOrCreate("Gpu")->error("Cannot retrieve properties for GPU {}", gpuId);
		throw std::exception();
	}

	std::stringstream name;
	name << "gpu" << gpuId << "(" << deviceProp.name << ")";

	return name.str();
}

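// Register IO memory (e.g. a PCIe BAR of another device) with CUDA so that it
// becomes accessible from this GPU. The memory has to be mapped into the
// process' virtual address space and be reachable via the PCIe bus; on
// success, the resulting device pointer is recorded as a mapping from the
// GPU's master PCIe address space to the PCIe bus.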
bool Gpu::registerIoMemory(const MemoryBlock& mem)
{
	auto& mm = MemoryManager::get();
	const auto pciAddrSpaceId = mm.getPciAddressSpace();

	// Check if we need to map anything at all, maybe it's already reachable
	try {
		// TODO: there might already be a path through the graph, but there's no
		// overlapping window, so this will fail badly!
		auto translation = mm.getTranslation(masterPciEAddrSpaceId,
		                                     mem.getAddrSpaceId());
		if(translation.getSize() >= mem.getSize()) {
			// there is already a sufficient path
			logger->debug("Already mapped through another mapping");
			return true;
		} else {
			logger->warn("There's already a mapping, but too small");
		}
	} catch(const std::out_of_range&) {
		// not yet reachable, that's okay, proceed
	}


	// In order to register IO memory with CUDA, it has to be mapped to the VA
	// space of the current process (requirement of CUDA API). Check this now.
	MemoryManager::AddressSpaceId mappedBaseAddrSpaceId;
	try {
		auto path = mm.findPath(mm.getProcessAddressSpace(), mem.getAddrSpaceId());
		// first node in path is the mapped memory space whose virtual address
		// we need to hand to CUDA
		mappedBaseAddrSpaceId = path.front();
	} catch (const std::out_of_range&) {
		logger->error("Memory not reachable from process, but required by CUDA");
		return false;
	}

	// determine the base address of the mapped memory region needed by CUDA
	const auto translationProcess = mm.getTranslationFromProcess(mappedBaseAddrSpaceId);
	const uintptr_t baseAddrForProcess = translationProcess.getLocalAddr(0);


	// Now check that the memory is also reachable via PCIe bus, otherwise GPU
	// has no means to access it.
	uintptr_t baseAddrOnPci;
	size_t sizeOnPci;
	try {
		auto translationPci = mm.getTranslation(pciAddrSpaceId,
		                                        mappedBaseAddrSpaceId);
		baseAddrOnPci = translationPci.getLocalAddr(0);
		sizeOnPci = translationPci.getSize();
	} catch(const std::out_of_range&) {
		logger->error("Memory is not reachable via PCIe bus");
		return false;
	}

	if(sizeOnPci < mem.getSize()) {
		logger->warn("VA mapping of IO memory is too small: {:#x} instead of {:#x} bytes",
		              sizeOnPci, mem.getSize());
		logger->warn("If something later on fails or behaves strangely, this might be the cause!");
	}


	cudaSetDevice(gpuId);

	auto baseAddrVA = reinterpret_cast<void*>(baseAddrForProcess);
	if(cudaHostRegister(baseAddrVA, sizeOnPci, cudaHostRegisterIoMemory) != cudaSuccess) {
		logger->error("Cannot register IO memory for block {}", mem.getAddrSpaceId());
		return false;
	}

	void* devicePointer = nullptr;
	if(cudaHostGetDevicePointer(&devicePointer, baseAddrVA, 0) != cudaSuccess) {
		logger->error("Cannot retrieve device pointer for IO memory");
		return false;
	}

	mm.createMapping(reinterpret_cast<uintptr_t>(devicePointer), baseAddrOnPci,
	                 sizeOnPci, "CudaIoMem", masterPciEAddrSpaceId, pciAddrSpaceId);

	return true;
}

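// Pin regular (non-CUDA) host memory with cudaHostRegister() so that this GPU
// can access it, and record the resulting device pointer as a mapping from
// the GPU's master PCIe address space to the memory block.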
bool
Gpu::registerHostMemory(const MemoryBlock& mem)
{
	auto& mm = MemoryManager::get();

	auto translation = mm.getTranslationFromProcess(mem.getAddrSpaceId());
	auto localBase = reinterpret_cast<void*>(translation.getLocalAddr(0));

	int ret = cudaHostRegister(localBase, mem.getSize(), 0);
	if(ret != cudaSuccess) {
		logger->error("Cannot register memory block {} addr={:p} size={:#x} to CUDA: ret={}",
		              mem.getAddrSpaceId(), localBase, mem.getSize(), ret);
		return false;
	}

	void* devicePointer = nullptr;
	ret = cudaHostGetDevicePointer(&devicePointer, localBase, 0);
	if(ret != cudaSuccess) {
		logger->error("Cannot retrieve device pointer for IO memory: ret={}", ret);
		return false;
	}

	mm.createMapping(reinterpret_cast<uintptr_t>(devicePointer), 0, mem.getSize(),
	                 "CudaHostMem", masterPciEAddrSpaceId, mem.getAddrSpaceId());

	return true;
}

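// Expose a memory block that resides on this GPU to the CPU and to peer PCIe
// devices via GDRcopy: the block is pinned, mapped through a GPU BAR into the
// process' virtual address space, and its DMA (bus) address is registered
// with the memory manager.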
bool Gpu::makeAccessibleToPCIeAndVA(const MemoryBlock& mem)
{
	if(pImpl->gdr == nullptr) {
		logger->warn("GDRcopy not available");
		return false;
	}

	auto& mm = MemoryManager::get();

	try {
		auto path = mm.findPath(masterPciEAddrSpaceId, mem.getAddrSpaceId());
		// if first hop is the PCIe bus, we know that memory is off-GPU
		if(path.front() == mm.getPciAddressSpace()) {
			throw std::out_of_range("Memory block is outside of this GPU");
		}

	} catch (const std::out_of_range&) {
		logger->error("Trying to map non-GPU memory block");
		return false;
	}

	logger->debug("retrieve complete device pointer from point of view of GPU");
	// retrieve complete device pointer from point of view of GPU
	auto translation = mm.getTranslation(masterPciEAddrSpaceId,
	                                     mem.getAddrSpaceId());
	CUdeviceptr devptr = translation.getLocalAddr(0);

	int ret;

	// required to set this flag before mapping
	unsigned int enable = 1;
	ret = cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, devptr);
	if(ret != CUDA_SUCCESS) {
		logger->error("Cannot set pointer attributes on memory block {}: {}",
		              mem.getAddrSpaceId(), ret);
		return false;
	}

	gdr_mh_t mh;
	ret = gdr_pin_buffer(pImpl->gdr, devptr, mem.getSize(), 0, 0, &mh);
	if(ret != 0) {
		logger->error("Cannot pin memory block {} via gdrcopy: {}",
		              mem.getAddrSpaceId(), ret);
		return false;
	}

	void* bar = nullptr;
	ret = gdr_map(pImpl->gdr, mh, &bar, mem.getSize());
	if(ret != 0) {
		logger->error("Cannot map memory block {} via gdrcopy: {}",
		              mem.getAddrSpaceId(), ret);
		return false;
	}

	gdr_info_t info;
	ret = gdr_get_info(pImpl->gdr, mh, &info);
	if(ret != 0) {
		logger->error("Cannot get info for mapping of memory block {}: {}",
		              mem.getAddrSpaceId(), ret);
		return false;
	}

	const uintptr_t offset = info.va - devptr;
	const uintptr_t userPtr = reinterpret_cast<uintptr_t>(bar) + offset;

	logger->debug("BAR ptr:          {:p}",  bar);
	logger->debug("info.va:          {:#x}", info.va);
	logger->debug("info.mapped_size: {:#x}", info.mapped_size);
	logger->debug("info.page_size:   {:#x}", info.page_size);
	logger->debug("offset:           {:#x}", offset);
	logger->debug("user pointer:     {:#x}", userPtr);

	// mapping to access the memory block from the process
	mm.createMapping(userPtr, 0, info.mapped_size, "GDRcopy",
	                 mm.getProcessAddressSpace(), mem.getAddrSpaceId());

	// retrieve bus address
	static constexpr size_t maxDmaAddresses = 1 << 10;
	uint64_t addr[maxDmaAddresses];
	ret = gdr_map_dma(pImpl->gdr, mh, 3, 0, 0, addr, maxDmaAddresses);

	for(int i = 0; i < ret; i++) {
		logger->debug("DMA addr[{}]:      {:#x}", i, addr[i]);
	}

	if(ret != 1) {
		logger->warn("Only one DMA address per block is supported at the moment, using the first one");
	}

	// mapping to access memory block from peer devices via PCIe
	mm.createMapping(addr[0], 0, mem.getSize(), "GDRcopyDMA",
	                 mm.getPciAddressSpace(), mem.getAddrSpaceId());

	return true;
}

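// Make an off-GPU memory block usable by this GPU: IO memory reachable via
// the PCIe bus is handled by registerIoMemory(), plain host memory by
// registerHostMemory().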
bool
Gpu::makeAccessibleFromPCIeOrHostRam(const MemoryBlock& mem)
{
	// Check which kind of memory this is and where it resides
	// There are two possibilities:
	//  - Host memory not managed by CUDA
	//  - IO memory somewhere on the PCIe bus

	auto& mm = MemoryManager::get();

	bool isIoMemory = false;
	try {
		auto path = mm.findPath(mm.getPciAddressSpace(), mem.getAddrSpaceId());
		isIoMemory = true;
	} catch(const std::out_of_range&) {
		// not reachable via PCI -> not IO memory
	}

	if(isIoMemory) {
		logger->debug("Memory block {} is assumed to be IO memory",
		              mem.getAddrSpaceId());

		return registerIoMemory(mem);
	} else {
		logger->debug("Memory block {} is assumed to be non-CUDA host memory",
		              mem.getAddrSpaceId());

		return registerHostMemory(mem);
	}
}

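// Copy between two memory blocks with cudaMemcpy(), using the blocks'
// addresses as seen from this GPU.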
void Gpu::memcpySync(const MemoryBlock& src, const MemoryBlock& dst, size_t size)
{
	auto& mm = MemoryManager::get();

	auto src_translation = mm.getTranslation(masterPciEAddrSpaceId,
	                                         src.getAddrSpaceId());
	const void* src_buf = reinterpret_cast<void*>(src_translation.getLocalAddr(0));

	auto dst_translation = mm.getTranslation(masterPciEAddrSpaceId,
	                                         dst.getAddrSpaceId());
	void* dst_buf = reinterpret_cast<void*>(dst_translation.getLocalAddr(0));

	cudaSetDevice(gpuId);
	cudaMemcpy(dst_buf, src_buf, size, cudaMemcpyDefault);
}

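// Copy between two memory blocks by launching a single-thread copy kernel on
// this GPU and waiting for it to complete.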
void Gpu::memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t size)
{
	auto& mm = MemoryManager::get();

	auto src_translation = mm.getTranslation(masterPciEAddrSpaceId,
	                                         src.getAddrSpaceId());
	auto src_buf = reinterpret_cast<uint8_t*>(src_translation.getLocalAddr(0));

	auto dst_translation = mm.getTranslation(masterPciEAddrSpaceId,
	                                         dst.getAddrSpaceId());
	auto dst_buf = reinterpret_cast<uint8_t*>(dst_translation.getLocalAddr(0));

	cudaSetDevice(gpuId);
	kernel_memcpy<<<1, 1>>>(dst_buf, src_buf, size);
	cudaDeviceSynchronize();
}

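// Translate a memory block's address into this GPU's master PCIe address
// space.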
MemoryTranslation
Gpu::translate(const MemoryBlock& dst)
{
	auto& mm = MemoryManager::get();
	return mm.getTranslation(masterPciEAddrSpaceId, dst.getAddrSpaceId());
}

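// Round n up to the next multiple of m, e.g. roundUp(5000, 4096) == 8192.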
static long roundUp(long n, long m)
{
	return n >= 0 ? ((n + m - 1) / m) * m : (n / m) * m;
}

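// Allocate a block of GPU memory. Allocation is chunked: cudaMalloc() is only
// invoked when no existing chunk has enough free space; each new chunk is
// made accessible to the CPU and PCIe bus and then managed by a
// LinearAllocator.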
std::unique_ptr<villas::MemoryBlock, villas::MemoryBlock::deallocator_fn>
GpuAllocator::allocateBlock(size_t size)
{
	cudaSetDevice(gpu.gpuId);

	void* addr;
	auto& mm = MemoryManager::get();

	// search for an existing chunk that has enough free memory
	auto chunk = std::find_if(chunks.begin(), chunks.end(), [&](const auto& chunk) {
		return chunk->getAvailableMemory() >= size;
	});


	if(chunk != chunks.end()) {
		logger->debug("Found existing chunk that can host the requested block");

		return (*chunk)->allocateBlock(size);

	} else {
		// allocate a new chunk

		// rounded-up multiple of GPU page size
		const size_t chunkSize = roundUp(size, GpuPageSize);

		logger->debug("Allocate new chunk of {:#x} bytes", chunkSize);

		if(cudaSuccess != cudaMalloc(&addr, chunkSize)) {
			logger->error("cudaMalloc(..., size={}) failed", chunkSize);
			throw std::bad_alloc();
		}

		// assemble name for this block
		std::stringstream name;
		name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);

		auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
		auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);

		const auto localAddr = reinterpret_cast<uintptr_t>(addr);
		std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
		        mem(new MemoryBlock(localAddr, chunkSize, blockAddrSpaceId), this->free);

		insertMemoryBlock(*mem);

		// make the new chunk accessible to the CPU right away
		gpu.makeAccessibleToPCIeAndVA(*mem);

		// create a new allocator to manage the chunk and push to chunk list
		chunks.push_front(std::make_unique<LinearAllocator>(std::move(mem)));

		// call again, this time there's a large enough chunk
		return allocateBlock(size);
	}
}


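// GDRcopy support is optional: if /dev/gdrdrv cannot be opened, only
// makeAccessibleToPCIeAndVA() is unavailable.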
Gpu::Gpu(int gpuId) :
    pImpl{std::make_unique<impl>()},
    gpuId(gpuId)
{
	logger = loggerGetOrCreate(getName());

	pImpl->gdr = gdr_open();
	if(pImpl->gdr == nullptr) {
		logger->warn("No GDRcopy support enabled, cannot open /dev/gdrdrv");
	}
}

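// Create the GPU's master PCIe address space, set up the allocator and
// register the GPU's PCI BARs with the memory manager.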
bool Gpu::init()
{
	auto& mm = MemoryManager::get();

	const auto gpuPciEAddrSpaceName = mm.getMasterAddrSpaceName(getName(), "PCIe");
	masterPciEAddrSpaceId = mm.getOrCreateAddressSpace(gpuPciEAddrSpaceName);

	allocator = std::make_unique<GpuAllocator>(*this);

	cudaDeviceProp deviceProp;
	cudaGetDeviceProperties(&deviceProp, gpuId);

	pImpl->pdev.slot = {
	    deviceProp.pciDomainID,
	    deviceProp.pciBusID,
	    deviceProp.pciDeviceID,
	    0};

	struct pci_region* pci_regions = nullptr;
	const size_t pci_num_regions = pci_get_regions(&pImpl->pdev, &pci_regions);
	for(size_t i = 0; i < pci_num_regions; i++) {
		const size_t region_size = pci_regions[i].end - pci_regions[i].start + 1;
		logger->info("BAR{}: bus addr={:#x} size={:#x}",
		             pci_regions[i].num, pci_regions[i].start, region_size);

		char name[] = "BARx";
		name[3] = '0' + pci_regions[i].num;

		auto gpuBarXAddrSpaceName = mm.getSlaveAddrSpaceName(getName(), name);
		auto gpuBarXAddrSpaceId = mm.getOrCreateAddressSpace(gpuBarXAddrSpaceName);

		mm.createMapping(pci_regions[i].start, 0, region_size,
		                 std::string("PCI-") + name,
		                 mm.getPciAddressSpace(), gpuBarXAddrSpaceId);
	}

	free(pci_regions);

	return true;
}


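// Enumerate all CUDA devices and return an initialized Gpu instance for every
// device that could be activated.
//
// Minimal usage sketch (hypothetical caller code; how the factory and the
// allocator member are reached from outside this file is an assumption):
//
//   auto gpus = gpuFactory.make();
//   if (not gpus.empty()) {
//       auto& gpu = gpus.front();
//       auto block = gpu->allocator->allocateBlock(1 << 20); // 1 MiB on the GPU
//   }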
std::list<std::unique_ptr<Gpu>>
GpuFactory::make()
{
	int deviceCount = 0;
	cudaGetDeviceCount(&deviceCount);

	std::list<std::unique_ptr<Gpu>> gpuList;

	for(int gpuId = 0; gpuId < deviceCount; gpuId++) {
		if(cudaSetDevice(gpuId) != cudaSuccess) {
			logger->warn("Cannot activate GPU {}", gpuId);
			continue;
		}

		auto gpu = std::make_unique<Gpu>(gpuId);

		if(not gpu->init()) {
			logger->warn("Cannot initialize GPU {}", gpuId);
			continue;
		}

		gpuList.emplace_back(std::move(gpu));
	}

	logger->info("Initialized {} GPUs", gpuList.size());
	for(auto& gpu : gpuList) {
		logger->debug(" - {}", gpu->getName());
	}

	return gpuList;
}

} // namespace gpu
} // namespace villas