Commit a87df1b5 authored by Steffen Vogel's avatar Steffen Vogel 🎅🏼

Merge branch 'feature/hls-rtds2gpu' into develop

parents 86fab248 90cb9374
Subproject commit dd7d75d0aab3801d65f9ff757d82f47f705514af
Subproject commit 9747c6ead6dedff943dbf22ce74e40e9b2622514
This diff is collapsed.
......@@ -71,8 +71,8 @@ public:
const std::string& port,
bool isMaster)
{
for(auto& [vertexId, vertex] : vertices) {
(void) vertexId;
for(auto& vertexEntry : vertices) {
auto& vertex = vertexEntry.second;
if(vertex->nodeName == node and vertex->portName == port and vertex->isMaster == isMaster)
return vertex;
}
......@@ -86,7 +86,7 @@ public:
};
class IpNode : public IpCore {
class IpNode : public virtual IpCore {
public:
friend class IpNodeFactory;
......
......@@ -43,7 +43,10 @@ public:
bool init();
bool reset();
// memory-mapped to stream (MM2S)
bool write(const MemoryBlock& mem, size_t len);
// stream to memory-mapped (S2MM)
bool read(const MemoryBlock& mem, size_t len);
size_t writeComplete()
......
#pragma once
#include <villas/memory.hpp>
#include <villas/fpga/ip_node.hpp>
#include <villas/fpga/ips/hls.hpp>
#include <villas/fpga/ips/rtds2gpu/register_types.hpp>
#include <villas/fpga/ips/rtds2gpu/xgpu2rtds_hw.h>
namespace villas {
namespace fpga {
namespace ip {
class Gpu2Rtds : public IpNode, public Hls
{
public:
friend class Gpu2RtdsFactory;
bool init();
void dump(spdlog::level::level_enum logLevel = spdlog::level::info);
bool startOnce(size_t frameSize);
size_t getMaxFrameSize();
const StreamVertex&
getDefaultMasterPort() const
{ return getMasterPort(rtdsOutputStreamPort); }
MemoryBlock
getRegisterMemory() const
{ return MemoryBlock(0, 1 << 10, getAddressSpaceId(registerMemory)); }
private:
bool updateStatus();
public:
static constexpr const char* rtdsOutputStreamPort = "rtds_output";
struct StatusControlRegister { uint32_t
status_ap_vld : 1,
_res : 31;
};
using StatusRegister = axilite_reg_status_t;
static constexpr uintptr_t registerStatusOffset = XGPU2RTDS_CTRL_ADDR_STATUS_DATA;
static constexpr uintptr_t registerStatusCtrlOffset = XGPU2RTDS_CTRL_ADDR_STATUS_CTRL;
static constexpr uintptr_t registerFrameSizeOffset = XGPU2RTDS_CTRL_ADDR_FRAME_SIZE_DATA;
static constexpr uintptr_t registerFrameOffset = XGPU2RTDS_CTRL_ADDR_FRAME_BASE;
static constexpr uintptr_t registerFrameLength = XGPU2RTDS_CTRL_DEPTH_FRAME;
public:
StatusRegister* registerStatus;
StatusControlRegister* registerStatusCtrl;
uint32_t* registerFrameSize;
uint32_t* registerFrames;
size_t maxFrameSize;
bool started;
};
class Gpu2RtdsFactory : public IpNodeFactory {
public:
Gpu2RtdsFactory();
IpCore* create()
{ return new Gpu2Rtds; }
std::string
getName() const
{ return "Gpu2Rtds"; }
std::string
getDescription() const
{ return "HLS Gpu2Rtds IP"; }
Vlnv getCompatibleVlnv() const
{ return {"acs.eonerc.rwth-aachen.de:hls:gpu2rtds:"}; }
};
} // namespace ip
} // namespace fpga
} // namespace villas
#pragma once
#include <villas/memory.hpp>
#include <villas/fpga/ip_node.hpp>
namespace villas {
namespace fpga {
namespace ip {
class Hls : public virtual IpCore
{
public:
virtual bool init()
{
auto& registers = addressTranslations.at(registerMemory);
controlRegister = reinterpret_cast<ControlRegister*>(registers.getLocalAddr(registerControlAddr));
globalIntRegister = reinterpret_cast<GlobalIntRegister*>(registers.getLocalAddr(registerGlobalIntEnableAddr));
ipIntEnableRegister = reinterpret_cast<IpIntRegister*>(registers.getLocalAddr(registerIntEnableAddr));
ipIntStatusRegister = reinterpret_cast<IpIntRegister*>(registers.getLocalAddr(registerIntStatusAddr));
setAutoRestart(false);
setGlobalInterrupt(false);
return true;
}
bool start()
{
controlRegister->ap_start = true;
running = true;
return true;
}
virtual bool isFinished()
{ updateRunningStatus(); return !running; }
bool isRunning()
{ updateRunningStatus(); return running; }
void setAutoRestart(bool enabled) const
{ controlRegister->auto_restart = enabled; }
void setGlobalInterrupt(bool enabled) const
{ globalIntRegister->globalInterruptEnable = enabled; }
void setReadyInterrupt(bool enabled) const
{ ipIntEnableRegister->ap_ready = enabled; }
void setDoneInterrupt(bool enabled) const
{ ipIntEnableRegister->ap_done = enabled; }
bool isIdleBit() const
{ return controlRegister->ap_idle; }
bool isReadyBit() const
{ return controlRegister->ap_ready; }
/// Warning: the corresponding bit is cleared on read of the register, so if
/// not used correctly, this function may never return true. Only use this
/// function if you really know what you are doing!
bool isDoneBit() const
{ return controlRegister->ap_done; }
bool isAutoRestartBit() const
{ return controlRegister->auto_restart; }
private:
void updateRunningStatus()
{
if(running and isIdleBit())
running = false;
}
protected:
/* Memory block handling */
static constexpr const char* registerMemory = "Reg";
virtual std::list<MemoryBlockName> getMemoryBlocks() const
{ return { registerMemory }; }
public:
/* Register definitions */
static constexpr uintptr_t registerControlAddr = 0x00;
static constexpr uintptr_t registerGlobalIntEnableAddr = 0x04;
static constexpr uintptr_t registerIntEnableAddr = 0x08;
static constexpr uintptr_t registerIntStatusAddr = 0x0c;
union ControlRegister {
uint32_t value;
struct { uint32_t
ap_start : 1,
ap_done : 1,
ap_idle : 1,
ap_ready : 1,
_res1 : 3,
auto_restart : 1,
_res2 : 24;
};
};
struct GlobalIntRegister { uint32_t
globalInterruptEnable : 1,
_res : 31;
};
struct IpIntRegister { uint32_t
ap_done : 1,
ap_ready : 1,
_res : 30;
};
protected:
ControlRegister* controlRegister;
GlobalIntRegister* globalIntRegister;
IpIntRegister* ipIntEnableRegister;
IpIntRegister* ipIntStatusRegister;
bool running;
};
} // namespace ip
} // namespace fpga
} // namespace villas
......@@ -44,6 +44,14 @@ public:
std::list<std::string> getMemoryBlocks() const
{ return { registerMemory }; }
const StreamVertex&
getDefaultSlavePort() const
{ return getSlavePort(slavePort); }
const StreamVertex&
getDefaultMasterPort() const
{ return getMasterPort(masterPort); }
private:
static constexpr const char registerMemory[] = "reg0";
static constexpr const char* irqTs = "irq_ts";
......
#pragma once
#include <villas/memory.hpp>
#include <villas/fpga/ip_node.hpp>
#include <villas/fpga/ips/hls.hpp>
#include "rtds2gpu/xrtds2gpu.h"
#include "rtds2gpu/register_types.hpp"
namespace villas {
namespace fpga {
namespace ip {
union ControlRegister {
uint32_t value;
struct { uint32_t
ap_start : 1,
ap_done : 1,
ap_idle : 1,
ap_ready : 1,
_res1 : 3,
auto_restart : 1,
_res2 : 24;
};
};
class Rtds2Gpu : public IpNode, public Hls
{
public:
friend class Rtds2GpuFactory;
bool init();
void dump(spdlog::level::level_enum logLevel = spdlog::level::info);
bool startOnce(const MemoryBlock& mem, size_t frameSize, size_t dataOffset, size_t doorbellOffset);
size_t getMaxFrameSize();
void dumpDoorbell(uint32_t doorbellRegister) const;
bool doorbellIsValid(const uint32_t& doorbellRegister) const
{ return reinterpret_cast<const reg_doorbell_t&>(doorbellRegister).is_valid; }
void doorbellReset(uint32_t& doorbellRegister) const
{ doorbellRegister = 0; }
static constexpr const char* registerMemory = "Reg";
std::list<MemoryBlockName> getMemoryBlocks() const
{ return { registerMemory }; }
const StreamVertex&
getDefaultSlavePort() const
{ return getSlavePort(rtdsInputStreamPort); }
private:
bool updateStatus();
private:
static constexpr const char* axiInterface = "m_axi_axi_mm";
static constexpr const char* rtdsInputStreamPort = "rtds_input";
XRtds2gpu xInstance;
axilite_reg_status_t status;
size_t maxFrameSize;
bool started;
};
class Rtds2GpuFactory : public IpNodeFactory {
public:
Rtds2GpuFactory();
IpCore* create()
{ return new Rtds2Gpu; }
std::string
getName() const
{ return "Rtds2Gpu"; }
std::string
getDescription() const
{ return "HLS RTDS2GPU IP"; }
Vlnv getCompatibleVlnv() const
{ return {"acs.eonerc.rwth-aachen.de:hls:rtds2gpu:"}; }
};
} // namespace ip
} // namespace fpga
} // namespace villas
#ifndef REGISTER_TYPES_H
#define REGISTER_TYPES_H
#include <stdint.h>
#include <cstddef>
#include <cstdint>
union axilite_reg_status_t {
uint32_t value;
struct {
uint32_t
last_seq_nr : 16,
last_count : 6,
max_frame_size : 6,
invalid_frame_size : 1,
frame_too_short : 1,
frame_too_long : 1,
is_running : 1;
};
};
union reg_doorbell_t {
uint32_t value;
struct {
uint32_t
seq_nr : 16,
count : 6,
is_valid : 1;
};
constexpr reg_doorbell_t() : value(0) {}
};
template<size_t N, typename T = uint32_t>
struct Rtds2GpuMemoryBuffer {
// this type is only for memory interpretation, it makes no sense to create
// an instance so it's forbidden
Rtds2GpuMemoryBuffer() = delete;
// T can be a more complex type that wraps multiple values
static constexpr size_t rawValueCount = N * (sizeof(T) / 4);
// As of C++14, offsetof() is not working for non-standard layout types (i.e.
// composed of non-POD members). This might work in C++17 though.
// More info: https://gist.github.com/graphitemaster/494f21190bb2c63c5516
//static constexpr size_t doorbellOffset = offsetof(Rtds2GpuMemoryBuffer, doorbell);
//static constexpr size_t dataOffset = offsetof(Rtds2GpuMemoryBuffer, data);
// HACK: This might break horribly, let's just hope C++17 will be there soon
static constexpr size_t dataOffset = 0;
static constexpr size_t doorbellOffset = N * sizeof(Rtds2GpuMemoryBuffer::data);
T data[N];
reg_doorbell_t doorbell;
};
#endif // REGISTER_TYPES_H
// ==============================================================
// File generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC
// Version: 2017.3
// Copyright (C) 1986-2017 Xilinx, Inc. All Rights Reserved.
//
// ==============================================================
// CTRL
// 0x00 : Control signals
// bit 0 - ap_start (Read/Write/COH)
// bit 1 - ap_done (Read/COR)
// bit 2 - ap_idle (Read)
// bit 3 - ap_ready (Read)
// bit 7 - auto_restart (Read/Write)
// others - reserved
// 0x04 : Global Interrupt Enable Register
// bit 0 - Global Interrupt Enable (Read/Write)
// others - reserved
// 0x08 : IP Interrupt Enable Register (Read/Write)
// bit 0 - Channel 0 (ap_done)
// bit 1 - Channel 1 (ap_ready)
// others - reserved
// 0x0c : IP Interrupt Status Register (Read/TOW)
// bit 0 - Channel 0 (ap_done)
// bit 1 - Channel 1 (ap_ready)
// others - reserved
// 0x10 : Data signal of frame_size
// bit 31~0 - frame_size[31:0] (Read/Write)
// 0x14 : reserved
// 0x80 : Data signal of status
// bit 31~0 - status[31:0] (Read)
// 0x84 : Control signal of status
// bit 0 - status_ap_vld (Read/COR)
// others - reserved
// 0x40 ~
// 0x7f : Memory 'frame' (16 * 32b)
// Word n : bit [31:0] - frame[n]
// (SC = Self Clear, COR = Clear on Read, TOW = Toggle on Write, COH = Clear on Handshake)
#define XGPU2RTDS_CTRL_ADDR_AP_CTRL 0x00
#define XGPU2RTDS_CTRL_ADDR_GIE 0x04
#define XGPU2RTDS_CTRL_ADDR_IER 0x08
#define XGPU2RTDS_CTRL_ADDR_ISR 0x0c
#define XGPU2RTDS_CTRL_ADDR_FRAME_SIZE_DATA 0x10
#define XGPU2RTDS_CTRL_BITS_FRAME_SIZE_DATA 32
#define XGPU2RTDS_CTRL_ADDR_STATUS_DATA 0x80
#define XGPU2RTDS_CTRL_BITS_STATUS_DATA 32
#define XGPU2RTDS_CTRL_ADDR_STATUS_CTRL 0x84
#define XGPU2RTDS_CTRL_ADDR_FRAME_BASE 0x40
#define XGPU2RTDS_CTRL_ADDR_FRAME_HIGH 0x7f
#define XGPU2RTDS_CTRL_WIDTH_FRAME 32
#define XGPU2RTDS_CTRL_DEPTH_FRAME 16
// ==============================================================
// File generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC
// Version: 2017.3
// Copyright (C) 1986-2017 Xilinx, Inc. All Rights Reserved.
//
// ==============================================================
#ifndef XRTDS2GPU_H
#define XRTDS2GPU_H
#ifdef __cplusplus
extern "C" {
#endif
/***************************** Include Files *********************************/
#ifndef __linux__
#include "xil_types.h"
#include "xil_assert.h"
#include "xstatus.h"
#include "xil_io.h"
#else
#include <stdint.h>
#include <assert.h>
#include <dirent.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stddef.h>
#endif
#include "xrtds2gpu_hw.h"
/**************************** Type Definitions ******************************/
#ifdef __linux__
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
#else
typedef struct {
u16 DeviceId;
u32 Ctrl_BaseAddress;
} XRtds2gpu_Config;
#endif
typedef struct {
u32 Ctrl_BaseAddress;
u32 IsReady;
} XRtds2gpu;
/***************** Macros (Inline Functions) Definitions *********************/
#ifndef __linux__
#define XRtds2gpu_WriteReg(BaseAddress, RegOffset, Data) \
Xil_Out32((BaseAddress) + (RegOffset), (u32)(Data))
#define XRtds2gpu_ReadReg(BaseAddress, RegOffset) \
Xil_In32((BaseAddress) + (RegOffset))
#else
#define XRtds2gpu_WriteReg(BaseAddress, RegOffset, Data) \
*(volatile u32*)((BaseAddress) + (RegOffset)) = (u32)(Data)
#define XRtds2gpu_ReadReg(BaseAddress, RegOffset) \
*(volatile u32*)((BaseAddress) + (RegOffset))
#define Xil_AssertVoid(expr) assert(expr)
#define Xil_AssertNonvoid(expr) assert(expr)
#define XST_SUCCESS 0
#define XST_DEVICE_NOT_FOUND 2
#define XST_OPEN_DEVICE_FAILED 3
#define XIL_COMPONENT_IS_READY 1
#endif
/************************** Function Prototypes *****************************/
#ifndef __linux__
int XRtds2gpu_Initialize(XRtds2gpu *InstancePtr, u16 DeviceId);
XRtds2gpu_Config* XRtds2gpu_LookupConfig(u16 DeviceId);
int XRtds2gpu_CfgInitialize(XRtds2gpu *InstancePtr, XRtds2gpu_Config *ConfigPtr);
#else
int XRtds2gpu_Initialize(XRtds2gpu *InstancePtr, const char* InstanceName);
int XRtds2gpu_Release(XRtds2gpu *InstancePtr);
#endif
void XRtds2gpu_Start(XRtds2gpu *InstancePtr);
u32 XRtds2gpu_IsDone(XRtds2gpu *InstancePtr);
u32 XRtds2gpu_IsIdle(XRtds2gpu *InstancePtr);
u32 XRtds2gpu_IsReady(XRtds2gpu *InstancePtr);
void XRtds2gpu_EnableAutoRestart(XRtds2gpu *InstancePtr);
void XRtds2gpu_DisableAutoRestart(XRtds2gpu *InstancePtr);
void XRtds2gpu_Set_baseaddr(XRtds2gpu *InstancePtr, u32 Data);
u32 XRtds2gpu_Get_baseaddr(XRtds2gpu *InstancePtr);
void XRtds2gpu_Set_data_offset(XRtds2gpu *InstancePtr, u32 Data);
u32 XRtds2gpu_Get_data_offset(XRtds2gpu *InstancePtr);
void XRtds2gpu_Set_doorbell_offset(XRtds2gpu *InstancePtr, u32 Data);
u32 XRtds2gpu_Get_doorbell_offset(XRtds2gpu *InstancePtr);
void XRtds2gpu_Set_frame_size(XRtds2gpu *InstancePtr, u32 Data);
u32 XRtds2gpu_Get_frame_size(XRtds2gpu *InstancePtr);
u32 XRtds2gpu_Get_status(XRtds2gpu *InstancePtr);
u32 XRtds2gpu_Get_status_vld(XRtds2gpu *InstancePtr);
void XRtds2gpu_InterruptGlobalEnable(XRtds2gpu *InstancePtr);
void XRtds2gpu_InterruptGlobalDisable(XRtds2gpu *InstancePtr);
void XRtds2gpu_InterruptEnable(XRtds2gpu *InstancePtr, u32 Mask);
void XRtds2gpu_InterruptDisable(XRtds2gpu *InstancePtr, u32 Mask);
void XRtds2gpu_InterruptClear(XRtds2gpu *InstancePtr, u32 Mask);
u32 XRtds2gpu_InterruptGetEnabled(XRtds2gpu *InstancePtr);
u32 XRtds2gpu_InterruptGetStatus(XRtds2gpu *InstancePtr);
#ifdef __cplusplus
}
#endif
#endif
// ==============================================================
// File generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC
// Version: 2017.3
// Copyright (C) 1986-2017 Xilinx, Inc. All Rights Reserved.
//
// ==============================================================
// CTRL
// 0x00 : Control signals
// bit 0 - ap_start (Read/Write/COH)
// bit 1 - ap_done (Read/COR)
// bit 2 - ap_idle (Read)
// bit 3 - ap_ready (Read)
// bit 7 - auto_restart (Read/Write)
// others - reserved
// 0x04 : Global Interrupt Enable Register
// bit 0 - Global Interrupt Enable (Read/Write)
// others - reserved
// 0x08 : IP Interrupt Enable Register (Read/Write)
// bit 0 - Channel 0 (ap_done)
// bit 1 - Channel 1 (ap_ready)
// others - reserved
// 0x0c : IP Interrupt Status Register (Read/TOW)
// bit 0 - Channel 0 (ap_done)
// bit 1 - Channel 1 (ap_ready)
// others - reserved
// 0x10 : Data signal of baseaddr
// bit 31~0 - baseaddr[31:0] (Read/Write)
// 0x14 : reserved
// 0x18 : Data signal of data_offset
// bit 31~0 - data_offset[31:0] (Read/Write)
// 0x1c : reserved
// 0x20 : Data signal of doorbell_offset
// bit 31~0 - doorbell_offset[31:0] (Read/Write)
// 0x24 : reserved
// 0x28 : Data signal of frame_size
// bit 31~0 - frame_size[31:0] (Read/Write)
// 0x2c : reserved
// 0x30 : Data signal of status
// bit 31~0 - status[31:0] (Read)
// 0x34 : Control signal of status
// bit 0 - status_ap_vld (Read/COR)
// others - reserved
// (SC = Self Clear, COR = Clear on Read, TOW = Toggle on Write, COH = Clear on Handshake)
#define XRTDS2GPU_CTRL_ADDR_AP_CTRL 0x00
#define XRTDS2GPU_CTRL_ADDR_GIE 0x04
#define XRTDS2GPU_CTRL_ADDR_IER 0x08
#define XRTDS2GPU_CTRL_ADDR_ISR 0x0c
#define XRTDS2GPU_CTRL_ADDR_BASEADDR_DATA 0x10
#define XRTDS2GPU_CTRL_BITS_BASEADDR_DATA 32
#define XRTDS2GPU_CTRL_ADDR_DATA_OFFSET_DATA 0x18
#define XRTDS2GPU_CTRL_BITS_DATA_OFFSET_DATA 32
#define XRTDS2GPU_CTRL_ADDR_DOORBELL_OFFSET_DATA 0x20
#define XRTDS2GPU_CTRL_BITS_DOORBELL_OFFSET_DATA 32
#define XRTDS2GPU_CTRL_ADDR_FRAME_SIZE_DATA 0x28
#define XRTDS2GPU_CTRL_BITS_FRAME_SIZE_DATA 32
#define XRTDS2GPU_CTRL_ADDR_STATUS_DATA 0x30
#define XRTDS2GPU_CTRL_BITS_STATUS_DATA 32
#define XRTDS2GPU_CTRL_ADDR_STATUS_CTRL 0x34
......@@ -34,8 +34,16 @@ set(SOURCES
ips/dma.cpp
ips/bram.cpp
ips/rtds.cpp
ips/rtds2gpu/rtds2gpu.cpp
ips/rtds2gpu/xrtds2gpu.c
ips/rtds2gpu/gpu2rtds.cpp
)
# we don't have much influence on drivers generated by Xilinx, so ignore warnings
set_source_files_properties(ips/rtds2gpu/xrtds2gpu.c
PROPERTIES COMPILE_FLAGS -Wno-int-to-pointer-cast)
include(FindPkgConfig)
pkg_check_modules(JANSSON jansson)
......
......@@ -374,6 +374,13 @@ void Gpu::memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t si
cudaDeviceSynchronize();
}
MemoryTranslation
Gpu::translate(const MemoryBlock& dst)
{
auto& mm = MemoryManager::get();
return mm.getTranslation(masterPciEAddrSpaceId, dst.getAddrSpaceId());
}
std::unique_ptr<villas::MemoryBlock, villas::MemoryBlock::deallocator_fn>
GpuAllocator::allocateBlock(size_t size)
......@@ -381,29 +388,53 @@ GpuAllocator::allocateBlock(size_t size)
cudaSetDevice(gpu.gpuId);
void* addr;
if(cudaSuccess != cudaMalloc(&addr, size)) {
logger->error("cudaMalloc(..., size={}) failed", size);
throw std::bad_alloc();
}
auto& mm = MemoryManager::get();
// assemble name for this block
std::stringstream name;
name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
// search for an existing chunk that has enough free memory
auto chunk = std::find_if(chunks.begin(), chunks.end(), [&](const auto& chunk) {
return chunk->getAvailableMemory() >= size;
});
if(chunk != chunks.end()) {
logger->debug("Found existing chunk that can host the requested block");
auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
return (*chunk)->allocateBlock(size);
const auto localAddr = reinterpret_cast<uintptr_t>(addr);
std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
mem(new MemoryBlock(localAddr, size, blockAddrSpaceId), this->free);
} else {
// allocate a new chunk