...
 
Commits (13)
......@@ -341,7 +341,7 @@
"num_ports": 8
},
"hier_0_gpu2rtds_0": {
"vlnv": "acs.eonerc.rwth-aachen.de:hls:gpu2rtds:1.0",
"vlnv": "acs.eonerc.rwth-aachen.de:hls:gpu2rtds:1.2",
"ports": [
{
"role": "master",
......@@ -369,7 +369,7 @@
}
},
"hier_0_rtds2gpu_0": {
"vlnv": "acs.eonerc.rwth-aachen.de:hls:rtds2gpu:1.1",
"vlnv": "acs.eonerc.rwth-aachen.de:hls:rtds2gpu:1.2",
"memory-view": {
"m_axi_axi_mm": {
"pcie_0_axi_pcie_0": {
......
......@@ -19,7 +19,10 @@ public:
bool init();
void dump(spdlog::level::level_enum logLevel = spdlog::level::info);
void dump()
{ dumpLog(); }
bool dumpLog(spdlog::level::level_enum logLevel = spdlog::level::info);
bool startOnce(size_t frameSize);
size_t getMaxFrameSize();
......
......@@ -32,7 +32,10 @@ public:
bool init();
void dump(spdlog::level::level_enum logLevel = spdlog::level::info);
void dump()
{ dumpLog(); }
bool dumpLog(spdlog::level::level_enum logLevel = spdlog::level::info);
bool startOnce(const MemoryBlock& mem, size_t frameSize, size_t dataOffset, size_t doorbellOffset);
......@@ -43,6 +46,10 @@ public:
bool doorbellIsValid(const uint32_t& doorbellRegister) const
{ return reinterpret_cast<const reg_doorbell_t&>(doorbellRegister).is_valid; }
const axilite_reg_status_t&
getStatusRegister()
{ updateStatus(); return status; }
void doorbellReset(uint32_t& doorbellRegister) const
{ doorbellRegister = 0; }
......
......@@ -19,6 +19,29 @@ union axilite_reg_status_t {
};
};
/*
* Access functions for status register to handle offset in register
* representation because of size constraints.
*/
static inline void
setStatusMaxFrameSize(volatile axilite_reg_status_t& reg, uint32_t value)
{ reg.max_frame_size = value - 1; }
static inline void
setStatusLastCount(volatile axilite_reg_status_t& reg, uint32_t value)
{ reg.last_count = value - 1; }
static inline uint32_t
getStatusLastCount(const volatile axilite_reg_status_t& reg)
{ return reg.last_count + 1; }
static inline uint32_t
getStatusMaxFrameSize(const volatile axilite_reg_status_t& reg)
{ return reg.max_frame_size + 1; }
union reg_doorbell_t {
uint32_t value;
struct {
......@@ -31,6 +54,24 @@ union reg_doorbell_t {
constexpr reg_doorbell_t() : value(0) {}
};
/*
* Access functions for doorbell register to handle offset in register
* representation because of size constraints.
*/
static inline void
setDoorbellCount(volatile reg_doorbell_t& reg, uint32_t value)
{ reg.count = value - 1; }
static inline uint32_t
getDoorbellCount(const volatile reg_doorbell_t& reg)
{ return reg.count + 1; }
template<size_t N, typename T = uint32_t>
struct Rtds2GpuMemoryBuffer {
// this type is only for memory interpretation, it makes no sense to create
......@@ -48,7 +89,7 @@ struct Rtds2GpuMemoryBuffer {
// HACK: This might break horribly, let's just hope C++17 will be there soon
static constexpr size_t dataOffset = 0;
static constexpr size_t doorbellOffset = N * sizeof(Rtds2GpuMemoryBuffer::data);
static constexpr size_t doorbellOffset = sizeof(Rtds2GpuMemoryBuffer::data);
T data[N];
reg_doorbell_t doorbell;
......
......@@ -6,48 +6,48 @@
// ==============================================================
// CTRL
// 0x00 : Control signals
// bit 0 - ap_start (Read/Write/COH)
// bit 1 - ap_done (Read/COR)
// bit 2 - ap_idle (Read)
// bit 3 - ap_ready (Read)
// bit 7 - auto_restart (Read/Write)
// others - reserved
// 0x04 : Global Interrupt Enable Register
// bit 0 - Global Interrupt Enable (Read/Write)
// others - reserved
// 0x08 : IP Interrupt Enable Register (Read/Write)
// bit 0 - Channel 0 (ap_done)
// bit 1 - Channel 1 (ap_ready)
// others - reserved
// 0x0c : IP Interrupt Status Register (Read/TOW)
// bit 0 - Channel 0 (ap_done)
// bit 1 - Channel 1 (ap_ready)
// others - reserved
// 0x10 : Data signal of frame_size
// bit 31~0 - frame_size[31:0] (Read/Write)
// 0x14 : reserved
// 0x80 : Data signal of status
// bit 31~0 - status[31:0] (Read)
// 0x84 : Control signal of status
// bit 0 - status_ap_vld (Read/COR)
// others - reserved
// 0x40 ~
// 0x7f : Memory 'frame' (16 * 32b)
// Word n : bit [31:0] - frame[n]
// 0x000 : Control signals
// bit 0 - ap_start (Read/Write/COH)
// bit 1 - ap_done (Read/COR)
// bit 2 - ap_idle (Read)
// bit 3 - ap_ready (Read)
// bit 7 - auto_restart (Read/Write)
// others - reserved
// 0x004 : Global Interrupt Enable Register
// bit 0 - Global Interrupt Enable (Read/Write)
// others - reserved
// 0x008 : IP Interrupt Enable Register (Read/Write)
// bit 0 - Channel 0 (ap_done)
// bit 1 - Channel 1 (ap_ready)
// others - reserved
// 0x00c : IP Interrupt Status Register (Read/TOW)
// bit 0 - Channel 0 (ap_done)
// bit 1 - Channel 1 (ap_ready)
// others - reserved
// 0x010 : Data signal of frame_size
// bit 31~0 - frame_size[31:0] (Read/Write)
// 0x014 : reserved
// 0x200 : Data signal of status
// bit 31~0 - status[31:0] (Read)
// 0x204 : Control signal of status
// bit 0 - status_ap_vld (Read/COR)
// others - reserved
// 0x100 ~
// 0x1ff : Memory 'frame' (64 * 32b)
// Word n : bit [31:0] - frame[n]
// (SC = Self Clear, COR = Clear on Read, TOW = Toggle on Write, COH = Clear on Handshake)
#define XGPU2RTDS_CTRL_ADDR_AP_CTRL 0x00
#define XGPU2RTDS_CTRL_ADDR_GIE 0x04
#define XGPU2RTDS_CTRL_ADDR_IER 0x08
#define XGPU2RTDS_CTRL_ADDR_ISR 0x0c
#define XGPU2RTDS_CTRL_ADDR_FRAME_SIZE_DATA 0x10
#define XGPU2RTDS_CTRL_ADDR_AP_CTRL 0x000
#define XGPU2RTDS_CTRL_ADDR_GIE 0x004
#define XGPU2RTDS_CTRL_ADDR_IER 0x008
#define XGPU2RTDS_CTRL_ADDR_ISR 0x00c
#define XGPU2RTDS_CTRL_ADDR_FRAME_SIZE_DATA 0x010
#define XGPU2RTDS_CTRL_BITS_FRAME_SIZE_DATA 32
#define XGPU2RTDS_CTRL_ADDR_STATUS_DATA 0x80
#define XGPU2RTDS_CTRL_ADDR_STATUS_DATA 0x200
#define XGPU2RTDS_CTRL_BITS_STATUS_DATA 32
#define XGPU2RTDS_CTRL_ADDR_STATUS_CTRL 0x84
#define XGPU2RTDS_CTRL_ADDR_FRAME_BASE 0x40
#define XGPU2RTDS_CTRL_ADDR_FRAME_HIGH 0x7f
#define XGPU2RTDS_CTRL_ADDR_STATUS_CTRL 0x204
#define XGPU2RTDS_CTRL_ADDR_FRAME_BASE 0x100
#define XGPU2RTDS_CTRL_ADDR_FRAME_HIGH 0x1ff
#define XGPU2RTDS_CTRL_WIDTH_FRAME 32
#define XGPU2RTDS_CTRL_DEPTH_FRAME 16
#define XGPU2RTDS_CTRL_DEPTH_FRAME 64
......@@ -30,8 +30,14 @@ public:
MemoryTranslation(uintptr_t src, uintptr_t dst, size_t size) :
src(src), dst(dst), size(size) {}
uintptr_t
getLocalAddr(uintptr_t addrInForeignAddrSpace) const;
template<typename ReturnType = uintptr_t>
ReturnType
getLocalAddr(uintptr_t addrInForeignAddrSpace) const
{
assert(addrInForeignAddrSpace >= dst);
assert(addrInForeignAddrSpace < (dst + size));
return reinterpret_cast<ReturnType>(src + addrInForeignAddrSpace - dst);
}
uintptr_t
getForeignAddr(uintptr_t addrInLocalAddrSpace) const;
......
......@@ -9,6 +9,8 @@ namespace utils {
std::vector<std::string>
tokenize(std::string s, std::string delimiter);
std::string
join(std::vector<std::string> strings, std::string delimiter);
template<typename T>
void
......
......@@ -148,13 +148,6 @@ MemoryManager::pathCheck(const MemoryGraph::Path& path)
return true;
}
uintptr_t
MemoryTranslation::getLocalAddr(uintptr_t addrInForeignAddrSpace) const
{
assert(addrInForeignAddrSpace >= dst);
assert(addrInForeignAddrSpace < (dst + size));
return src + addrInForeignAddrSpace - dst;
}
uintptr_t
MemoryTranslation::getForeignAddr(uintptr_t addrInLocalAddrSpace) const
......
......@@ -31,5 +31,24 @@ tokenize(std::string s, std::string delimiter)
return tokens;
}
std::string
join(std::vector<std::string> strings, std::string delimiter)
{
std::string out;
for(size_t i = 0; i < strings.size(); i++) {
const auto& s = strings[i];
if(s.length() > 0) {
out += strings[i];
if(i < (strings.size() - 1) and strings[i+1].length() > 0)
out += delimiter;
}
}
return out;
}
} // namespace utils
} // namespace villas
......@@ -30,6 +30,9 @@ GpuAllocator::GpuAllocator(Gpu& gpu) :
if(cudaFree(reinterpret_cast<void*>(mem->getOffset())) != cudaSuccess) {
logger->warn("cudaFree() failed for {:#x} of size {:#x}",
mem->getOffset(), mem->getSize());
} else {
logger->debug("cudaFree({:#x}) successfull (size was {:#x})",
mem->getOffset(), mem->getSize());
}
removeMemoryBlock(*mem);
......@@ -268,16 +271,20 @@ bool Gpu::makeAccessibleToPCIeAndVA(const MemoryBlock& mem)
mm.getProcessAddressSpace(), mem.getAddrSpaceId());
// retrieve bus address
uint64_t addr[8];
ret = gdr_map_dma(pImpl->gdr, mh, 3, 0, 0, addr, 8);
static constexpr size_t maxDmaAddresses = 1 << 10;
uint64_t addr[maxDmaAddresses];
// WARNING: PCI slot of FPGA hardcoded here! In the end this is fed into the
// Nvidia kernel module and god knows what happens if we provide
// an invalid pci dev here. At least there is no documention. The
// BAR address should not be any different, but ... ?!
ret = gdr_map_dma(pImpl->gdr, mh, 3, 0, 0, addr, maxDmaAddresses);
for(int i = 0; i < ret; i++) {
logger->debug("DMA addr[{}]: {:#x}", i, addr[i]);
}
if(ret != 1) {
logger->error("Only one DMA address per block supported at the moment");
return false;
logger->warn("Only one DMA address per block supported at the moment, use first");
}
// mapping to access memory block from peer devices via PCIe
......@@ -358,6 +365,9 @@ Gpu::translate(const MemoryBlock& dst)
return mm.getTranslation(masterPciEAddrSpaceId, dst.getAddrSpaceId());
}
static long roundUp(long n, long m) {
return n >= 0 ? ((n + m - 1) / m) * m : (n / m) * m;
}
std::unique_ptr<villas::MemoryBlock, villas::MemoryBlock::deallocator_fn>
GpuAllocator::allocateBlock(size_t size)
......@@ -382,7 +392,8 @@ GpuAllocator::allocateBlock(size_t size)
// allocate a new chunk
// rounded-up multiple of GPU page size
const size_t chunkSize = size - (size & (GpuPageSize - 1)) + GpuPageSize;
const size_t chunkSize = roundUp(size, GpuPageSize);
logger->debug("Allocate new chunk of {:#x} bytes", chunkSize);
if(cudaSuccess != cudaMalloc(&addr, chunkSize)) {
......
......@@ -39,21 +39,23 @@ Gpu2Rtds::startOnce(size_t frameSize)
return true;
}
void Gpu2Rtds::dump(spdlog::level::level_enum logLevel)
bool Gpu2Rtds::dumpLog(spdlog::level::level_enum logLevel)
{
const auto frame_size = *registerFrameSize;
auto status = *registerStatus;
logger->log(logLevel, "Gpu2Rtds registers:");
logger->log(logLevel, " Frame size (words): {:#x}", frame_size);
logger->log(logLevel, " Frame size (words): {}", frame_size);
logger->log(logLevel, " Status: {:#x}", status.value);
logger->log(logLevel, " Running: {}", (status.is_running ? "yes" : "no"));
logger->log(logLevel, " Frame too short: {}", (status.frame_too_short ? "yes" : "no"));
logger->log(logLevel, " Frame too long: {}", (status.frame_too_long ? "yes" : "no"));
logger->log(logLevel, " Frame size invalid: {}", (status.invalid_frame_size ? "yes" : "no"));
logger->log(logLevel, " Last count: {}", status.last_count);
logger->log(logLevel, " Last count: {}", getStatusLastCount(status));
logger->log(logLevel, " Last seq. number: {}", status.last_seq_nr);
logger->log(logLevel, " Max. frame size: {}", status.max_frame_size);
logger->log(logLevel, " Max. frame size: {}", getStatusMaxFrameSize(status));
return true;
}
//bool Gpu2Rtds::startOnce(const MemoryBlock& mem, size_t frameSize, size_t dataOffset, size_t doorbellOffset)
......@@ -118,7 +120,7 @@ Gpu2Rtds::getMaxFrameSize()
// assert(status.max_frame_size == (*registerStatus).max_frame_size);
return status.max_frame_size;
return getStatusMaxFrameSize(status);
}
//void
......
......@@ -23,7 +23,7 @@ bool Rtds2Gpu::init()
started = false;
// maxFrameSize = getMaxFrameSize();
maxFrameSize = 16;
maxFrameSize = 64;
logger->info("Max. frame size supported: {}", maxFrameSize);
return true;
......@@ -31,26 +31,33 @@ bool Rtds2Gpu::init()
void Rtds2Gpu::dump(spdlog::level::level_enum logLevel)
bool Rtds2Gpu::dumpLog(spdlog::level::level_enum logLevel)
{
const auto baseaddr = XRtds2gpu_Get_baseaddr(&xInstance);
const auto data_offset = XRtds2gpu_Get_data_offset(&xInstance);
const auto doorbell_offset = XRtds2gpu_Get_doorbell_offset(&xInstance);
const auto frame_size = XRtds2gpu_Get_frame_size(&xInstance);
if(not updateStatus()) {
logger->warn("Couldn't read status register (not ready), values may be wrong");
return false;
}
logger->log(logLevel, "Rtds2Gpu registers (IP base {:#x}):", xInstance.Ctrl_BaseAddress);
logger->log(logLevel, " Base address (bytes): {:#x}", baseaddr);
logger->log(logLevel, " Doorbell offset (bytes): {:#x}", doorbell_offset);
logger->log(logLevel, " Data offset (bytes): {:#x}", data_offset);
logger->log(logLevel, " Frame size (words): {:#x}", frame_size);
logger->log(logLevel, " Frame size (words): {}", frame_size);
logger->log(logLevel, " Status: {:#x}", status.value);
logger->log(logLevel, " Running: {}", (status.is_running ? "yes" : "no"));
logger->log(logLevel, " Frame too short: {}", (status.frame_too_short ? "yes" : "no"));
logger->log(logLevel, " Frame too long: {}", (status.frame_too_long ? "yes" : "no"));
logger->log(logLevel, " Frame size invalid: {}", (status.invalid_frame_size ? "yes" : "no"));
logger->log(logLevel, " Last count: {}", status.last_count);
logger->log(logLevel, " Last count: {}", getStatusLastCount(status));
logger->log(logLevel, " Last seq. number: {}", status.last_seq_nr);
logger->log(logLevel, " Max. frame size: {}", status.max_frame_size);
logger->log(logLevel, " Max. frame size: {}", getStatusMaxFrameSize(status));
return true;
}
bool Rtds2Gpu::startOnce(const MemoryBlock& mem, size_t frameSize, size_t dataOffset, size_t doorbellOffset)
......@@ -107,7 +114,7 @@ Rtds2Gpu::getMaxFrameSize()
while(not isFinished());
updateStatus();
return status.max_frame_size;
return getStatusMaxFrameSize(status);
}
void
......@@ -117,7 +124,7 @@ Rtds2Gpu::dumpDoorbell(uint32_t doorbellRegister) const
logger->info("Doorbell register: {:#08x}", doorbell.value);
logger->info(" Valid: {}", (doorbell.is_valid ? "yes" : "no"));
logger->info(" Count: {}", doorbell.count);
logger->info(" Count: {}", getDoorbellCount(doorbell));
logger->info(" Seq. number: {}", doorbell.seq_nr);
}
......
......@@ -99,8 +99,8 @@ Test(fpga, rtds2gpu_loopback_dma, .description = "Rtds2Gpu")
cr_assert_not_null(gpu2rtds, "No Gpu2Rtds IP found");
cr_assert_not_null(rtds, "RTDS IP not found");
rtds2gpu.dump(spdlog::level::debug);
gpu2rtds->dump(spdlog::level::debug);
rtds2gpu.dumpLog(spdlog::level::debug);
gpu2rtds->dumpLog(spdlog::level::debug);
/* Allocate and prepare memory */
......@@ -139,7 +139,7 @@ Test(fpga, rtds2gpu_loopback_dma, .description = "Rtds2Gpu")
while(not rtds2gpu.isFinished());
const uint32_t* doorbellDst = &dmaMemDst[DOORBELL_OFFSET];
rtds2gpu.dump(spdlog::level::info);
rtds2gpu.dumpLog(spdlog::level::info);
rtds2gpu.dumpDoorbell(*doorbellDst);
cr_assert(memcmp(dataSrc, dataDst, FRAME_SIZE) == 0, "Memory not equal");
......