add overhead.cu that tests the overhead of single CUDA calls

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>

add overhead.cu that tests the overhead of single CUDA calls
c6cb9371 · Niklas Eiling · bb46df45 · c6cb9371 · c6cb9371
Commit c6cb9371 authored 1 year ago by Niklas Eiling
--- a/tests/test_apps/Makefile
+++ b/tests/test_apps/Makefile
@@ -5,7 +5,7 @@ HOST_LD = gcc
 HOST_CFLAGS = -Wall -std=gnu99

 CC = nvcc -ccbin g++
-ARCH = sm_61
+ARCH = sm_80
 CFLAGS = -arch=$(ARCH) -cudart shared
 #CFLAGS = -arch=$(ARCH)
 LD = nvcc -ccbin g++
@@ -19,6 +19,8 @@ TEST_API_O = test_api.o
 TEST_KERNEL_BIN = kernel.testapp
 TEST_KERNEL_O = test_kernel.o
 BINARY = cricket.testapp
+OVERHEAD_BIN = overhead.testapp
+OVERHEAD_O = overhead.o

 TEST_KERNEL_LIB_O = test_kernel_lib.o
 TEST_KERNEL_LIB = test_kernel.so
@@ -58,6 +60,12 @@ $(TEST_KERNEL_BIN) : $(TEST_KERNEL_O)
 $(BINARY) : $(FILES)
 	$(LD) $(LDFLAGS) -o $@ $^

+$(OVERHEAD_O) : $(OVERHEAD_O:.o=.cu)
+	$(CC) -O2 $(CFLAGS) -dc -o $@ $<
+
+$(OVERHEAD_BIN) : $(OVERHEAD_O)
+	$(LD) $(LDFLAGS) -O2 -o $@ $^
+
 $(LIBCUDA_OBJ) : $(LIBCUDA_OBJ:.o=.c)
 	$(HOST_CC) -c -fpic -o $@ $< $(LIBCUDA_LIBS)


--- a/tests/test_apps/overhead.cu
+++ b/tests/test_apps/overhead.cu
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <cuda_runtime.h>
+
+/* Number of times each measured CUDA call is repeated. */
+#define ITERATIONS 10
+/* Size in bytes passed to cudaMalloc/cudaMemset for the device buffers.
+ * Parenthesized so the macro expands correctly inside larger expressions
+ * (e.g. `n / MEMSIZE` or `MEMSIZE * sizeof(T)`). */
+#define MEMSIZE (1024*1024)
+/* Number of threads in x-direction of the block used for every launch. */
+const int blocksize = 32;
+
+/*
+ * Element-wise multiplication kernel used to measure the launch cost of a
+ * kernel that takes parameters.
+ *
+ * Each thread multiplies one element of A with the matching element of x and
+ * stores the product in res. The element is selected by threadIdx.x alone,
+ * so all three buffers must hold at least blockDim.x uint16_t elements.
+ *
+ * b, c, a and d are not read by the kernel body.
+ */
+__global__ void kernel(uint16_t *A, uint16_t *x, uint16_t *res, char b, short c, int a, long long int d)
+{
+    const int tid = threadIdx.x;
+    const uint16_t product = A[tid] * x[tid];
+    res[tid] = product;
+}
+
+/* Empty kernel: takes no arguments and performs no work. */
+__global__ void kernel_no_param(void) {}
+
+/*
+ * Prints the wall-clock time between two gettimeofday() samples as
+ * "TOTALTIME: <seconds>.<microseconds>".
+ *
+ * The seconds and microseconds fields are subtracted separately, so a borrow
+ * is applied when end->tv_usec is smaller than begin->tv_usec; otherwise the
+ * microsecond part would be negative and the printed value meaningless.
+ */
+static void print_elapsed(const struct timeval *begin, const struct timeval *end)
+{
+    long sec = (long)(end->tv_sec - begin->tv_sec);
+    long usec = (long)(end->tv_usec - begin->tv_usec);
+
+    if (usec < 0) {
+        sec -= 1;
+        usec += 1000000L;
+    }
+    printf("TOTALTIME: %ld.%06ld\n\n", sec, usec);
+}
+
+/*
+ * Measures the wall-clock time of ITERATIONS repetitions of four operations:
+ * cudaGetDeviceCount, a cudaMalloc/cudaFree pair, launching a kernel without
+ * parameters and launching a kernel with parameters.
+ *
+ * Returns 0 on completion, 1 if the device buffers for the last measurement
+ * could not be set up.
+ */
+int main()
+{
+    struct timeval begin, end;
+
+    int iterations = ITERATIONS;
+    int cnt = 0;
+    int exit_code = 0;
+    cudaError_t result;
+
+    uint16_t *dev_A = NULL;
+    uint16_t *dev_x = NULL;
+    uint16_t *dev_res = NULL;
+    size_t A_size = MEMSIZE;
+    /* kernel() writes one uint16_t per thread of the block. */
+    size_t res_size = blocksize * sizeof(uint16_t);
+
+    dim3 dimBlock( blocksize, 1 );
+    dim3 dimGrid( 1, 1);
+
+    printf("init CUDA\n");
+    /* First call is made outside the timed region. */
+    cudaGetDeviceCount(&cnt);
+    printf("cudaGetDeviceCount (%d iterations)\n", iterations);
+    gettimeofday(&begin, NULL);
+    for (int i=0; i != iterations; i++) {
+        cudaGetDeviceCount(&cnt);
+    }
+    gettimeofday(&end, NULL);
+    print_elapsed(&begin, &end);
+
+    printf("cudaMalloc/cudaFree (%d iterations)\n", iterations);
+    gettimeofday(&begin, NULL);
+    for (int i=0; i != iterations; i++) {
+        cudaMalloc( (void**)&dev_A, A_size );
+        cudaFree( dev_A );
+    }
+    cudaDeviceSynchronize();
+    gettimeofday(&end, NULL);
+    print_elapsed(&begin, &end);
+    /* The pointer was freed in the loop; reset it so the cleanup at the end
+     * never frees a stale address. */
+    dev_A = NULL;
+
+    printf("kernel launch w/o parameteter (%d iterations)\n", iterations);
+    gettimeofday(&begin, NULL);
+    for (int i=0; i != iterations; i++) {
+        kernel_no_param<<<dimGrid, dimBlock>>>();
+    }
+    cudaDeviceSynchronize();
+    gettimeofday(&end, NULL);
+    result = cudaGetLastError();
+    printf("\nresult: %d\n", result);
+    print_elapsed(&begin, &end);
+
+    /* Buffer setup for the parameterized kernel; not part of the timing. */
+    result = cudaMalloc( (void**)&dev_x, A_size );
+    if (result == cudaSuccess) {
+        /* Sized for blocksize elements: every thread stores one result. */
+        result = cudaMalloc( (void**)&dev_res, res_size );
+    }
+    if (result == cudaSuccess) {
+        result = cudaMalloc( (void**)&dev_A, A_size );
+    }
+    if (result == cudaSuccess) {
+        result = cudaMemset( dev_A, 1, A_size );
+    }
+    if (result == cudaSuccess) {
+        result = cudaMemset( dev_x, 2, A_size );
+    }
+
+    if (result != cudaSuccess) {
+        fprintf(stderr, "device buffer setup failed: %s\n", cudaGetErrorString(result));
+        exit_code = 1;
+    } else {
+        printf("kernel launch w/ parameteters (%d iterations)\n", iterations);
+        gettimeofday(&begin, NULL);
+        for (int i=0; i != iterations; i++) {
+            kernel<<<dimGrid, dimBlock>>>(dev_A, dev_x, dev_res, 0, 0, 0, 0);
+        }
+        cudaDeviceSynchronize();
+        gettimeofday(&end, NULL);
+        result = cudaGetLastError();
+        printf("\nresult: %d\n", result);
+        print_elapsed(&begin, &end);
+    }
+
+    /* Release the device buffers; pointers that were never allocated are
+     * still NULL here. */
+    cudaFree( dev_A );
+    cudaFree( dev_x );
+    cudaFree( dev_res );
+
+    return exit_code;
+}