Skip to content
Snippets Groups Projects
Commit c6cb9371 authored by Niklas Eiling's avatar Niklas Eiling
Browse files

add overhead.cu that tests the overhead of single CUDA calls

parent bb46df45
No related branches found
No related tags found
No related merge requests found
Pipeline #1040546 failed
......@@ -5,7 +5,7 @@ HOST_LD = gcc
HOST_CFLAGS = -Wall -std=gnu99
CC = nvcc -ccbin g++
-ARCH = sm_61
+ARCH = sm_80
CFLAGS = -arch=$(ARCH) -cudart shared
#CFLAGS = -arch=$(ARCH)
LD = nvcc -ccbin g++
......@@ -19,6 +19,8 @@ TEST_API_O = test_api.o
TEST_KERNEL_BIN = kernel.testapp
TEST_KERNEL_O = test_kernel.o
BINARY = cricket.testapp
OVERHEAD_BIN = overhead.testapp
OVERHEAD_O = overhead.o
TEST_KERNEL_LIB_O = test_kernel_lib.o
TEST_KERNEL_LIB = test_kernel.so
......@@ -58,6 +60,12 @@ $(TEST_KERNEL_BIN) : $(TEST_KERNEL_O)
$(BINARY) : $(FILES)
$(LD) $(LDFLAGS) -o $@ $^
$(OVERHEAD_O) : $(OVERHEAD_O:.o=.cu)
$(CC) -O2 $(CFLAGS) -dc -o $@ $<
$(OVERHEAD_BIN) : $(OVERHEAD_O)
$(LD) $(LDFLAGS) -O2 -o $@ $^
$(LIBCUDA_OBJ) : $(LIBCUDA_OBJ:.o=.c)
$(HOST_CC) -c -fpic -o $@ $< $(LIBCUDA_LIBS)
......
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>
#include <cuda_runtime.h>
/* Number of times each measured CUDA call (or call pair) is repeated. */
#define ITERATIONS 10
/* Size in bytes of the device buffers. Parenthesized so the macro expands as a
 * single value inside larger expressions (e.g. `x / MEMSIZE`). */
#define MEMSIZE (1024*1024)
/* Threads per block used for both kernel launches. */
const int blocksize = 32;
/*
 * Element-wise product kernel used for the "launch with parameters" measurement.
 *
 * Each thread reads the A and x entries at its threadIdx.x, multiplies them and
 * stores the product (truncated to uint16_t) in res at the same index. Only
 * threadIdx.x is used for indexing and there is no bounds check, so all three
 * buffers must hold at least blockDim.x elements.
 *
 * The parameters b, c, a and d are never read by the body.
 */
__global__ void kernel(uint16_t *A, uint16_t *x, uint16_t *res, char b, short c, int a, long long int d)
{
    const int tid = threadIdx.x;
    const uint16_t lhs = A[tid];
    const uint16_t rhs = x[tid];
    res[tid] = lhs * rhs;
}
/*
 * Empty kernel without parameters; launched by main for the
 * "kernel launch w/o parameter" measurement.
 */
__global__ void kernel_no_param() {}
/*
 * Report a failed CUDA runtime call on stderr.
 * Returns 1 when err is cudaSuccess, 0 otherwise.
 */
static int cuda_ok(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) {
        return 1;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return 0;
}

/*
 * Print the wall-clock time between begin and end as "TOTALTIME: S.UUUUUU".
 * The microsecond difference is normalized with a borrow from the seconds, so
 * an interval that crosses a second boundary is printed correctly.
 */
static void print_total_time(const struct timeval *begin, const struct timeval *end)
{
    long sec = (long)(end->tv_sec - begin->tv_sec);
    long usec = (long)(end->tv_usec - begin->tv_usec);
    if (usec < 0) {
        sec -= 1;
        usec += 1000000L;
    }
    printf("TOTALTIME: %ld.%06ld\n\n", sec, usec);
}

/*
 * Measures the wall-clock time of repeated CUDA runtime calls:
 *   1. cudaGetDeviceCount
 *   2. cudaMalloc followed by cudaFree
 *   3. launching a kernel without parameters
 *   4. launching a kernel with parameters
 * Each measurement runs ITERATIONS times and prints one TOTALTIME line.
 *
 * The timed loops deliberately contain no error checks so that only the CUDA
 * calls themselves are measured; errors are collected after the timer stops.
 *
 * Returns 0 on success, EXIT_FAILURE if CUDA initialization or the buffer
 * setup for the last measurement fails.
 */
int main()
{
    struct timeval begin, end;
    int iterations = ITERATIONS;
    int cnt;

    /* The first runtime call initializes CUDA; keep it out of the timings. */
    printf("init CUDA\n");
    if (!cuda_ok(cudaGetDeviceCount(&cnt), "cudaGetDeviceCount")) {
        return EXIT_FAILURE;
    }

    printf("cudaGetDeviceCount (%d iterations)\n", iterations);
    gettimeofday(&begin, NULL);
    for (int i=0; i != iterations; i++) {
        cudaGetDeviceCount(&cnt);
    }
    gettimeofday(&end, NULL);
    print_total_time(&begin, &end);

    uint16_t *dev_A = NULL;
    size_t A_size = MEMSIZE;
    printf("cudaMalloc/cudaFree (%d iterations)\n", iterations);
    gettimeofday(&begin, NULL);
    for (int i=0; i != iterations; i++) {
        cudaMalloc( (void**)&dev_A, A_size );
        cudaFree( dev_A );
    }
    cudaError_t sync_status = cudaDeviceSynchronize();
    gettimeofday(&end, NULL);
    /* dev_A was freed in the loop; do not keep the dangling pointer around. */
    dev_A = NULL;
    cuda_ok(sync_status, "cudaDeviceSynchronize");
    cuda_ok(cudaGetLastError(), "cudaMalloc/cudaFree loop");
    print_total_time(&begin, &end);

    dim3 dimBlock( blocksize, 1 );
    dim3 dimGrid( 1, 1);
    printf("kernel launch w/o parameteter (%d iterations)\n", iterations);
    gettimeofday(&begin, NULL);
    for (int i=0; i != iterations; i++) {
        kernel_no_param<<<dimGrid, dimBlock>>>();
    }
    cudaDeviceSynchronize();
    gettimeofday(&end, NULL);
    cudaError_t result = cudaGetLastError();
    printf("\nresult: %d\n", result);
    print_total_time(&begin, &end);

    uint16_t *dev_x = NULL;
    uint16_t *dev_res = NULL;
    /* The kernel writes one uint16_t per thread of the block, so the result
     * buffer must hold blocksize elements. */
    const size_t res_size = (size_t)blocksize * sizeof(uint16_t);
    int setup_ok =
        cuda_ok(cudaMalloc( (void**)&dev_x, MEMSIZE ), "cudaMalloc(dev_x)") &&
        cuda_ok(cudaMalloc( (void**)&dev_res, res_size ), "cudaMalloc(dev_res)") &&
        cuda_ok(cudaMalloc( (void**)&dev_A, MEMSIZE ), "cudaMalloc(dev_A)") &&
        cuda_ok(cudaMemset( dev_A, 1, MEMSIZE ), "cudaMemset(dev_A)") &&
        cuda_ok(cudaMemset( dev_x, 2, MEMSIZE ), "cudaMemset(dev_x)");
    if (!setup_ok) {
        /* Pointers that were never allocated are still NULL, which cudaFree
         * accepts as a no-op. */
        cudaFree( dev_A );
        cudaFree( dev_res );
        cudaFree( dev_x );
        return EXIT_FAILURE;
    }

    printf("kernel launch w/ parameteters (%d iterations)\n", iterations);
    gettimeofday(&begin, NULL);
    for (int i=0; i != iterations; i++) {
        kernel<<<dimGrid, dimBlock>>>(dev_A, dev_x, dev_res, 0, 0, 0, 0);
    }
    cudaDeviceSynchronize();
    gettimeofday(&end, NULL);
    result = cudaGetLastError();
    printf("\nresult: %d\n", result);
    print_total_time(&begin, &end);

    /* Release the buffers used by the last measurement. */
    cuda_ok(cudaFree( dev_A ), "cudaFree(dev_A)");
    cuda_ok(cudaFree( dev_res ), "cudaFree(dev_res)");
    cuda_ok(cudaFree( dev_x ), "cudaFree(dev_x)");
    return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment