Commit dd485559 authored by Daniel Krebs's avatar Daniel Krebs

gpu: implement basic GPU plugin that can do DMA to and from its memory

Using CUDA, memory can be allocated on the GPU and shared to peers on
the PCIe bus such as the FPGA. Furthermore, the DMA on the GPU can also
be used to read and write to/from other memory on the PCIe bus, such as
BRAM on the FPGA.
parent bc9a1909
Pipeline #49303 failed with stages
in 16 seconds
......@@ -40,6 +40,7 @@ public:
Unknown,
FpgaIp,
FpgaCard,
Gpu
};
Plugin(Type type, const std::string& name);
......
cmake_minimum_required(VERSION 3.7)
cmake_minimum_required(VERSION 3.8)
project(villas-gpu
VERSION 1.0
DESCRIPTION "VILLASgpu"
LANGUAGES C CXX)
LANGUAGES C CXX CUDA)
# fail if CUDA not found
find_package(CUDA QUIET REQUIRED)
add_subdirectory(gdrcopy)
cuda_add_library(villas-gpu SHARED gpu.cpp)
add_library(villas-gpu SHARED gpu.cpp kernels.cu)
target_compile_options(villas-gpu PRIVATE -g)
set_source_files_properties(gpu.cpp PROPERTIES
LANGUAGE CUDA)
target_include_directories(villas-gpu PRIVATE /opt/cuda/include)
target_link_libraries(villas-gpu
PRIVATE villas-common gdrapi cuda)
target_include_directories(villas-gpu
PUBLIC ${CMAKE_CURRENT_LIST_DIR}/include)
PUBLIC
${CMAKE_CURRENT_LIST_DIR}/include
PRIVATE
${CMAKE_CURRENT_LIST_DIR})
Subproject commit 2b933176d0fd20f10bddfdf574a1d3229ca1ecdf
Subproject commit 0441daa447b80260c4e11096f03e88f7be08bfa2
This diff is collapsed.
#pragma once
#include <sstream>
#include <plugin.hpp>
#include <memory_manager.hpp>
#include <memory.hpp>
#include <villas/log.hpp>
namespace villas {
namespace gpu {
class GpuAllocator;
class Gpu {
friend GpuAllocator;
public:
Gpu(int gpuId);
~Gpu();
bool init();
std::string getName() const;
GpuAllocator& getAllocator() const
{ return *allocator; }
bool makeAccessibleToPCIeAndVA(const MemoryBlock& mem);
/// Make some memory block accssible for this GPU
bool makeAccessibleFromPCIeOrHostRam(const MemoryBlock& mem);
void memcpySync(const MemoryBlock& src, const MemoryBlock& dst, size_t size);
void memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t size);
private:
bool registerIoMemory(const MemoryBlock& mem);
bool registerHostMemory(const MemoryBlock& mem);
private:
class impl;
std::unique_ptr<impl> pImpl;
// master, will be used to derived slave addr spaces for allocation
MemoryManager::AddressSpaceId masterPciEAddrSpaceId;
MemoryManager::AddressSpaceId slaveMemoryAddrSpaceId;
SpdLogger logger;
int gpuId;
std::unique_ptr<GpuAllocator> allocator;
};
class GpuAllocator : public BaseAllocator<GpuAllocator> {
public:
GpuAllocator(Gpu& gpu);
std::string getName() const;
std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
allocateBlock(size_t size);
private:
Gpu& gpu;
};
class GpuFactory : public Plugin {
public:
GpuFactory();
std::list<std::unique_ptr<Gpu>>
make();
void run(void*);
private:
SpdLogger logger;
};
} // namespace villas
} // namespace gpu
#include <stdio.h>
#include <villas/gpu.hpp>
#include "kernels.hpp"
#include "cuda_runtime.h"
#include <cuda.h>
namespace villas {
namespace gpu {
__global__ void
kernel_mailbox(volatile uint32_t *mailbox, volatile uint32_t* counter)
{
printf("[gpu] hello!\n");
printf("[gpu] mailbox: %p\n", mailbox);
printf("[kernel] started\n");
while(1) {
if (*mailbox == 1) {
*mailbox = 0;
printf("[gpu] counter = %d\n", *counter);
break;
}
}
printf("[gpu] quit\n");
}
__global__ void
kernel_memcpy(volatile uint8_t* dst, volatile uint8_t* src, size_t length)
{
while(length > 0) {
*dst++ = *src++;
length--;
}
}
} // namespace villas
} // namespace gpu
#pragma once
#include <cstdint>
#include <cuda_runtime.h>
namespace villas {
namespace gpu {
__global__ void
kernel_mailbox(volatile uint32_t *mailbox, volatile uint32_t* counter);
__global__ void
kernel_memcpy(volatile uint8_t* dst, volatile uint8_t* src, size_t length);
} // namespace villas
} // namespace gpu
......@@ -4,6 +4,7 @@ set(SOURCES
logging.cpp
dma.cpp
fifo.cpp
gpu.cpp
# hls.c
# intc.c
# rtds_rtt.c
......
#include <criterion/criterion.h>
#include <map>
#include <string>
#include <villas/log.hpp>
#include <villas/fpga/card.hpp>
#include <villas/fpga/ips/dma.hpp>
#include <villas/fpga/ips/bram.hpp>
#include <villas/utils.h>
#include "global.hpp"
#include <villas/memory.hpp>
#include <villas/gpu.hpp>
Test(fpga, gpu_dma, .description = "GPU DMA tests")
{
auto logger = loggerGetOrCreate("unittest:dma");
auto& card = state.cards.front();
villas::Plugin* plugin = villas::Plugin::lookup(villas::Plugin::Type::Gpu, "");
auto gpuPlugin = dynamic_cast<villas::gpu::GpuFactory*>(plugin);
cr_assert_not_null(gpuPlugin, "No GPU plugin found");
auto gpus = gpuPlugin->make();
cr_assert(gpus.size() > 0, "No GPUs found");
// just get first cpu
auto& gpu = gpus.front();
size_t count = 0;
for(auto& ip : card->ips) {
// skip non-dma IPs
if(*ip != villas::fpga::Vlnv("xilinx.com:ip:axi_bram_ctrl:"))
continue;
logger->info("Testing {}", *ip);
auto bram = reinterpret_cast<villas::fpga::ip::Bram*>(ip.get());
cr_assert_not_null(bram, "Couldn't find BRAM");
count++;
size_t len = 4 * (1 << 10);
/* Allocate memory to use with DMA */
auto bram0 = bram->getAllocator().allocate<char>(len);
auto bram1 = bram->getAllocator().allocate<char>(len);
gpu->makeAccessibleFromPCIeOrHostRam(bram0.getMemoryBlock());
gpu->makeAccessibleFromPCIeOrHostRam(bram1.getMemoryBlock());
auto hostRam0 = villas::HostRam::getAllocator().allocate<char>(len);
auto hostRam1 = villas::HostRam::getAllocator().allocate<char>(len);
gpu->makeAccessibleFromPCIeOrHostRam(hostRam0.getMemoryBlock());
gpu->makeAccessibleFromPCIeOrHostRam(hostRam1.getMemoryBlock());
auto dmaRam0 = villas::HostDmaRam::getAllocator().allocate<char>(len);
auto dmaRam1 = villas::HostDmaRam::getAllocator().allocate<char>(len);
gpu->makeAccessibleFromPCIeOrHostRam(dmaRam0.getMemoryBlock());
gpu->makeAccessibleFromPCIeOrHostRam(dmaRam1.getMemoryBlock());
auto gpuMem0 = gpu->getAllocator().allocate<char>(64 << 10);
auto gpuMem1 = gpu->getAllocator().allocate<char>(64 << 10);
gpu->makeAccessibleToPCIeAndVA(gpuMem0.getMemoryBlock());
gpu->makeAccessibleToPCIeAndVA(gpuMem1.getMemoryBlock());
// auto& src = bram0;
// auto& dst = bram1;
// auto& src = hostRam0;
// auto& dst = hostRam1;
auto& src = dmaRam0;
// auto& dst = dmaRam1;
// auto& src = gpuMem0;
auto& dst = gpuMem1;
std::list<std::pair<std::string, std::function<void()>>> memcpyFuncs = {
{"cudaMemcpy", [&]() {gpu->memcpySync(src.getMemoryBlock(), dst.getMemoryBlock(), len);}},
{"CUDA kernel", [&]() {gpu->memcpyKernel(src.getMemoryBlock(), dst.getMemoryBlock(), len);}},
};
auto dmaIp = card->lookupIp(villas::fpga::Vlnv("xilinx.com:ip:axi_dma:"));
auto dma = dynamic_cast<villas::fpga::ip::Dma*>(dmaIp);
if(dma != nullptr and dma->connectLoopback()) {
memcpyFuncs.push_back({
"DMA memcpy", [&]() {
if(not dma->makeAccesibleFromVA(src.getMemoryBlock()) or
not dma->makeAccesibleFromVA(dst.getMemoryBlock())) {
return;
}
dma->memcpy(src.getMemoryBlock(), dst.getMemoryBlock(), len);
}});
}
for(auto& [name, memcpyFunc] : memcpyFuncs) {
logger->info("Testing {}", name);
/* Get new random data */
const size_t lenRandom = read_random(&src, len);
cr_assert(len == lenRandom, "Failed to get random data");
memcpyFunc();
const bool success = memcmp(&src, &dst, len) == 0;
logger->info(" {}", success ?
TXT_GREEN("Passed") :
TXT_RED("Failed"));
}
villas::MemoryManager::get().dump();
}
cr_assert(count > 0, "No BRAM found");
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment